In [None]:
pip install thefuzz

Collecting thefuzz
  Downloading thefuzz-0.20.0-py3-none-any.whl (15 kB)
Collecting rapidfuzz<4.0.0,>=3.0.0 (from thefuzz)
  Downloading rapidfuzz-3.5.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: rapidfuzz, thefuzz
Successfully installed rapidfuzz-3.5.2 thefuzz-0.20.0


In [None]:
import pandas as pd
from thefuzz import process


In [None]:
file_path = '/content/Base Capital IQ_completa_24112023.xls'
data = pd.read_excel(file_path, sheet_name='Screening')

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 35 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   Company Name                                           10000 non-null  object
 1   Exchange:Ticker                                        10000 non-null  object
 2   Geographic Locations                                   10000 non-null  object
 3   State/Region From Primary Address                      10000 non-null  object
 4   Company Type                                           10000 non-null  object
 5   Primary Industry                                       10000 non-null  object
 6   Primary Sector                                         10000 non-null  object
 7   Short Business Description                             10000 non-null  object
 8   Total Revenue [FY 2018] (BRLmm, Historical rate)       10

In [None]:
data = data.applymap(lambda x: x.strip().lower() if isinstance(x, str) else x)


In [None]:
# Supondo que 'nome_empresa' seja a coluna com os nomes das empresas
for nome in data['Company Name']:
    matches = process.extractBests(nome, data['Company Name'], limit=2)
    for match in matches:
        if match[1] > 90:  # 90 é um exemplo de limiar de similaridade
            print(f"Duplicata encontrada: {nome} e {match[0]}")


[1;30;43mA saída de streaming foi truncada nas últimas 5000 linhas.[0m
Duplicata encontrada: laticínios marilia s/a e laticínios marilia s/a
Duplicata encontrada: laticínios matinal ltda e laticínios matinal ltda
Duplicata encontrada: laticinios nituano s/a e laticinios nituano s/a
Duplicata encontrada: laticínios porto alegre ind. e com. ltda. e laticínios porto alegre ind. e com. ltda.
Duplicata encontrada: laticínios são vicente de minas s.a. e laticínios são vicente de minas s.a.
Duplicata encontrada: laticínios starmilk s/a e laticínios starmilk s/a
Duplicata encontrada: latin sports s.a. e latin sports s.a.
Duplicata encontrada: latin sports s.a. e latin sports s.a.
Duplicata encontrada: latin sports s.a. e latin sports s.a.
Duplicata encontrada: latin sports s.a. e latin sports s.a.
Duplicata encontrada: latina manutenção de rodovias ltda. e latina manutenção de rodovias ltda.
Duplicata encontrada: latina sinalização de rodovias ltda. e latina sinalização de rodovias ltda.
Dup

In [None]:
de_para = []

for nome in data['Company Name'].unique():
    matches = process.extractBests(nome, data['Company Name'].unique(), limit=2)
    for match in matches:
        if match[1] > 95 and nome != match[0]:
            de_para.append((nome, match[0]))

de_para_df = pd.DataFrame(de_para, columns=['Original', 'Duplicata'])


In [None]:
de_para_df.head()

Unnamed: 0,Original,Duplicata
0,7n administração e participações s/a,njh administração e participações s.a.
1,ads er eólica corredor do senandes ii s.a.,ads er eólica corredor do senandes iii s.a.
2,ads er eólica corredor do senandes iii s.a.,ads er eólica corredor do senandes ii s.a.
3,agência de vapores grieg s.a.,agência de vapores grieg s/a
4,agência de vapores grieg s/a,agência de vapores grieg s.a.


In [None]:
de_para_df.to_excel('relacao_de_para.xlsx', index=False)


In [None]:
# Extracting the relevant columns for Revenue and SG&A
# Revenue columns are I to M (indexes 8 to 12) and SG&A columns are S to W (indexes 18 to 22)

# Extracting and renaming the columns for better readability
revenue_columns = data.iloc[:, 8:13]
revenue_columns.columns = ['Revenue 2018', 'Revenue 2019', 'Revenue 2020', 'Revenue 2021', 'Revenue 2022']

sga_columns = data.iloc[:, 18:23]
sga_columns.columns = ['SGA 2018', 'SGA 2019', 'SGA 2020', 'SGA 2021', 'SGA 2022']

# Combining the extracted columns with the company names for context
company_names = data.iloc[:, 0]
combined_data = pd.concat([company_names, revenue_columns, sga_columns], axis=1)

# Removing the header row from the data
combined_data = combined_data.iloc[1:]

# Convert the revenue and SG&A columns to numeric values, handling non-numeric entries
for col in combined_data.columns[1:]:
    combined_data[col] = pd.to_numeric(combined_data[col], errors='coerce')

# Now we have the cleaned data with company names, revenue, and SG&A
combined_data.head()


Unnamed: 0,Company Name,Revenue 2018,Revenue 2019,Revenue 2020,Revenue 2021,Revenue 2022,SGA 2018,SGA 2019,SGA 2020,SGA 2021,SGA 2022
1,2.0 hotéis holding arco ltda.,,,,,,,,,,
2,2008 empreendimentos comerciais s.a.,,25.2,20.5,,,,0.75,2.86,,
3,220 capital investimentos e participações s.a.,,31.1,,,,,9.46,,,
4,2bcapital s.a.,,2.54,2.85,,,,7.06,9.12,,
5,2im inteligencia medica s a,2.49,,,,,2.08,,,,


In [None]:
# Defining the classification function based on the latest available revenue
def classify_company(revenue_series):
    # Getting the latest available revenue
    latest_revenue = revenue_series.dropna().max()

    # Classifying based on the provided criteria
    if latest_revenue <= 0.3:
        return 'Small Company'
    elif latest_revenue < 1.0:
        return 'Mid Company'
    else:
        return 'Large Company'

# Applying the classification
combined_data['Company Classification'] = combined_data[['Revenue 2018', 'Revenue 2019', 'Revenue 2020', 'Revenue 2021', 'Revenue 2022']].apply(classify_company, axis=1)

# Filtering out companies that are medium or large enterprises
medium_large_companies = combined_data[combined_data['Company Classification'].str.contains('Mid | Large | Small')]

medium_large_companies.head()  # Displaying the first few medium or large companies


Unnamed: 0,Company Name,Revenue 2018,Revenue 2019,Revenue 2020,Revenue 2021,Revenue 2022,SGA 2018,SGA 2019,SGA 2020,SGA 2021,SGA 2022,Company Classification
32,a.h.p. - gerenciamento patrimonial s/a,0.733,0.469,,,,0.369,0.192,,,,Mid Company
65,ac participacoes sa,0.969,0.806,,,,0.982,1.05,,,,Mid Company
139,administração e representações telles s.a.,0.651,,,,,0.256,,,,,Mid Company
141,administradora ipanema s.a.,0.549,0.969,0.886,,,0.887,0.795,0.648,,,Mid Company
255,agro florestal são caetano s.a.,0.021,0.577,0.451,,,0.26,0.271,0.329,,,Mid Company


In [None]:
# Calculating SG&A/Revenue ratio for each year
for year in range(2018, 2023):
    revenue_col = f'Revenue {year}'
    sga_col = f'SGA {year}'
    ratio_col = f'SG&A/Revenue {year}'
    combined_data[ratio_col] = combined_data[sga_col] / combined_data[revenue_col]

# Keeping only the relevant columns for trend analysis
trend_columns = ['Company Name', 'Company Classification'] + [f'SG&A/Revenue {year}' for year in range(2018, 2023)]
trend_analysis_data = combined_data[trend_columns]

# Displaying the first few rows for the trend analysis
trend_analysis_data.head()


Unnamed: 0,Company Name,Company Classification,SG&A/Revenue 2018,SG&A/Revenue 2019,SG&A/Revenue 2020,SG&A/Revenue 2021,SG&A/Revenue 2022
1,2.0 hotéis holding arco ltda.,Large Company,,,,,
2,2008 empreendimentos comerciais s.a.,Large Company,,0.029762,0.139512,,
3,220 capital investimentos e participações s.a.,Large Company,,0.30418,,,
4,2bcapital s.a.,Large Company,,2.779528,3.2,,
5,2im inteligencia medica s a,Large Company,0.835341,,,,


In [None]:
import numpy as np

# Defining a function to identify increasing trend in SG&A/Revenue ratio
def identify_increasing_trend(row):
    # Extracting the SG&A/Revenue values and dropping NaNs
    values = row[[f'SG&A/Revenue {year}' for year in range(2018, 2023)]].dropna()

    # Checking if there are at least two values to compare
    if len(values) < 2:
        return "Insufficient Data"

    # Checking for an increasing trend
    if np.all(np.diff(values) > 0):
      return "Aumento"

    elif np.all(np.diff(values) == 0):
      return "Estabilidade"

    else:
      return "Redução"

# Applying the function to identify trends
trend_analysis_data['Trend in SG&A/Revenue'] = trend_analysis_data.apply(identify_increasing_trend, axis=1)

# Preparing the final data for export
final_export_data = trend_analysis_data.rename(columns={'2bCapital S.A.': 'Company Name'})

# Saving the final data to an Excel file
final_export_data.to_excel('trend_sga.xlsx', index=False)


  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[slice2])
  if np.all(np.diff(values) > 0):
  a = op(a[slice1], a[sli

In [None]:
trend_analysis_data

Unnamed: 0,Company Name,Company Classification,SG&A/Revenue 2018,SG&A/Revenue 2019,SG&A/Revenue 2020,SG&A/Revenue 2021,SG&A/Revenue 2022,Trend in SG&A/Revenue
32,a.h.p. - gerenciamento patrimonial s/a,Mid Company,0.503411,0.409382,,,,Redução
65,ac participacoes sa,Mid Company,1.013416,1.302730,,,,Aumento
139,administração e representações telles s.a.,Mid Company,0.393241,,,,,Insufficient Data
141,administradora ipanema s.a.,Mid Company,1.615665,0.820433,0.731377,,,Redução
255,agro florestal são caetano s.a.,Mid Company,12.380952,0.469671,0.729490,,,Redução
...,...,...,...,...,...,...,...,...
9847,vérios gestão de recursos s.a.,Mid Company,1.740977,,,,,Insufficient Data
9853,vesper empreendimentos imobiliarios s/a,Mid Company,0.376488,,,,,Insufficient Data
9906,viana investimentos imobiliarios s/a,Mid Company,0.046012,,,,,Insufficient Data
9939,vila espirito santo ii empreendimentos e parti...,Mid Company,,,inf,0.140351,,Redução


In [None]:
combined_data

Unnamed: 0,Company Name,Revenue 2018,Revenue 2019,Revenue 2020,Revenue 2021,Revenue 2022,SGA 2018,SGA 2019,SGA 2020,SGA 2021,SGA 2022,Company Classification
1,2.0 hotéis holding arco ltda.,,,,,,,,,,,Large Company
2,2008 empreendimentos comerciais s.a.,,25.200,20.50,,,,0.750,2.860,,,Large Company
3,220 capital investimentos e participações s.a.,,31.100,,,,,9.460,,,,Large Company
4,2bcapital s.a.,,2.540,2.85,,,,7.060,9.120,,,Large Company
5,2im inteligencia medica s a,2.49,,,,,2.080,,,,,Large Company
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,viterra bioenergia s.a.,690.80,668.900,1114.20,1447.2,2132.1,75.300,85.400,108.400,133.4,164.8,Large Company
9996,viterra brasil s/a,7548.40,6299.100,9060.50,,,64.500,71.200,93.200,,,Large Company
9997,viterra logística de açúcar s.a.,0.00,0.228,1.99,,,0.275,0.383,0.261,,,Large Company
9998,vitivinícola santa maria s.a.,15.90,21.300,,,,7.120,8.020,,,,Large Company
