<a href="https://colab.research.google.com/github/neerajgoyal12/electoral_bonds/blob/main/notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [63]:
import pandas as pd
from pathlib import Path
import requests
import matplotlib.pyplot as plt

# Fetch the notebook's input files from the electoral_bonds GitHub repo
# (skipped for any file that is already present in the working directory).
REPO_RAW_URL = "https://raw.githubusercontent.com/neerajgoyal12/electoral_bonds/main"


def _download_if_missing(filename, url):
  """Download `url` to `filename` unless that file already exists.

  Args:
    filename: Local path (relative to the working directory) to write to.
    url: HTTP(S) location of the file.

  Raises:
    requests.HTTPError: if the server answers with an error status. Raised
      before anything is written, so an error page is never saved as data.
  """
  if Path(filename).is_file():
    print(f"{filename} already exists, skipping download")
    return
  print(f"Downloading {filename}")
  response = requests.get(url, timeout=30)
  # Without this check a 404/500 body would be written to disk and then be
  # picked up as "already exists" on every later run.
  response.raise_for_status()
  with open(filename, "wb") as f:
    f.write(response.content)


_download_if_missing("helper_functions.py", f"{REPO_RAW_URL}/helper_functions.py")
_download_if_missing("ec_data.csv", f"{REPO_RAW_URL}/data/ec_data.csv")


helper_functions.py already exists, skipping download
ec_data.csv already exists, skipping download


In [57]:
# Load the downloaded electoral-bond purchase records into a DataFrame.
df = pd.read_csv(filepath_or_buffer="ec_data.csv")

In [64]:
# Peek at the first five rows (5 is the default for head()).
df.head()

Unnamed: 0,Date of Purchase,Purchaser Name,Denomination
0,12/Apr/2019,A B C INDIA LIMITED,100000
1,12/Apr/2019,A B C INDIA LIMITED,100000
2,12/Apr/2019,A B C INDIA LIMITED,1000000
3,12/Apr/2019,A B C INDIA LIMITED,1000000
4,12/Apr/2019,A B C INDIA LIMITED,100000


In [65]:
# Peek at the last five rows (5 is the default for tail()).
df.tail()

Unnamed: 0,Date of Purchase,Purchaser Name,Denomination
18866,11/Jan/2024,WESTWELL GASES PRIVATE LIMITED,10000000
18867,11/Jan/2024,WESTWELL GASES PRIVATE LIMITED,10000000
18868,11/Jan/2024,WESTWELL GASES PRIVATE LIMITED,10000000
18869,11/Jan/2024,WESTWELL GASES PRIVATE LIMITED,10000000
18870,11/Jan/2024,WESTWELL GASES PRIVATE LIMITED,10000000


In [66]:
# Remove every column whose values are all NaN, then display the result.
df = df.dropna(axis="columns", how="all")
df

Unnamed: 0,Date of Purchase,Purchaser Name,Denomination
0,12/Apr/2019,A B C INDIA LIMITED,100000
1,12/Apr/2019,A B C INDIA LIMITED,100000
2,12/Apr/2019,A B C INDIA LIMITED,1000000
3,12/Apr/2019,A B C INDIA LIMITED,1000000
4,12/Apr/2019,A B C INDIA LIMITED,100000
...,...,...,...
18866,11/Jan/2024,WESTWELL GASES PRIVATE LIMITED,10000000
18867,11/Jan/2024,WESTWELL GASES PRIVATE LIMITED,10000000
18868,11/Jan/2024,WESTWELL GASES PRIVATE LIMITED,10000000
18869,11/Jan/2024,WESTWELL GASES PRIVATE LIMITED,10000000


In [67]:
# Short aliases for the column labels, reused by the cells below.
dstr, pnstr, denostr = 'Date of Purchase', 'Purchaser Name', 'Denomination'

In [68]:
# infer_objects() returns a converted copy rather than modifying the frame,
# so the result has to be assigned back for the call to have any effect.
df = df.infer_objects()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18871 entries, 0 to 18870
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Date of Purchase  18871 non-null  object
 1   Purchaser Name    18871 non-null  object
 2   Denomination      18871 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 442.4+ KB


In [69]:
# Convert dtypes for later convenience.
# The dates shown above look like "12/Apr/2019" (day/abbreviated-month/year);
# giving the format explicitly avoids relying on per-run format inference and
# makes a malformed row fail loudly instead of being guessed at.
df[dstr] = pd.to_datetime(df[dstr], format="%d/%b/%Y")
df[pnstr] = df[pnstr].astype(str)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18871 entries, 0 to 18870
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   Date of Purchase  18871 non-null  datetime64[ns]
 1   Purchaser Name    18871 non-null  object        
 2   Denomination      18871 non-null  int64         
dtypes: datetime64[ns](1), int64(1), object(1)
memory usage: 442.4+ KB


In [77]:
# Sum the remaining columns per (purchaser, purchase date) pair; the keys stay
# as ordinary columns because of as_index=False.
grouped_df = df.groupby(by=[pnstr, dstr], as_index=False).sum()
print(type(grouped_df), grouped_df.tail(), sep="\n")

<class 'pandas.core.frame.DataFrame'>
                    Purchaser Name Date of Purchase  Denomination
1975  ZUVAN ENERGY PRIVATE LIMITED       2023-04-11      10000000
1976          ZUVAN ENERGY PVT LTD       2022-04-08      20000000
1977      ZYDUS HEALTHCARE LIMITED       2022-10-10     180000000
1978      ZYDUS HEALTHCARE LIMITED       2022-11-14      30000000
1979      ZYDUS HEALTHCARE LIMITED       2023-07-10      80000000


In [78]:
# Number of (purchaser, date) rows after grouping.
grouped_df.shape[0]

1980

In [90]:
import numpy as np
# All purchaser names as an array, and the sorted set of distinct names.
pn = grouped_df.loc[:, pnstr].to_numpy()
upn = np.unique(pn)

In [None]:
# List pairs of purchaser names that are probably spelling variants of the same
# entity, for manual review.
# NOTE: `jaro_similarity` is imported in the jellyfish cell at the bottom of
# the notebook; that cell has to be run before this one.
FIRST_WORD_MIN_SIMILARITY = 0.97  # cheap pre-filter: first words must (nearly) match
FULL_NAME_MIN_SIMILARITY = 0.85   # whole names must be at least this similar

upn = np.unique(grouped_df[pnstr].to_numpy())

# Compare distinct names only (not every grouped row), and pre-split each name
# once; blank names are skipped because they have no first word to compare.
candidates = [(name, name.split()[0]) for name in upn if name.strip()]

for i, (name1, first1) in enumerate(candidates):
  # Only look at names after position i so each pair is reported once.
  for name2, first2 in candidates[i + 1:]:
    if jaro_similarity(first1, first2) < FIRST_WORD_MIN_SIMILARITY:
      continue
    similarity = jaro_similarity(name1, name2)
    if FULL_NAME_MIN_SIMILARITY < similarity < 1:
      print(f"{name1} | {name2} | {similarity}")

In [172]:
# Manual mapping from purchaser-name variants (abbreviated, truncated or
# misspelled) to one canonical spelling per purchaser.
# Keys must match the raw names exactly, including any trailing space.
dic = {
    "ASKUS LOGISTICS PRIV": "ASKUS LOGISTICS PRIVATE LIMITED",
    "ASKUS LOGISTICS PVT LTD": "ASKUS LOGISTICS PRIVATE LIMITED",
    "AUROBINDO PHARMA LTD": "AUROBINDO PHARMA LIMITED",
    "AVEES TRADING & FINANCE PVT LTD": "AVEES TRADING AND FINANCE PRIVATE LIMITED",
    "AVEES TRADING AND FINANCE PVT LTD": "AVEES TRADING AND FINANCE PRIVATE LIMITED",
    "AVEES TRADING FINANCE PVT LTD": "AVEES TRADING AND FINANCE PRIVATE LIMITED",
    "BALU IRON & STEEL COMPANY": "BALU IRON AND STEEL COMPANY",
    "CIPLA LTD": "CIPLA LIMITED",
    "COMFORT TRIMS PRIVATE LIMITED DIVIS": "COMFORT TRIMS PRIVATE LIMITED",
    "CROCHET TRADE AND INVESTMENT PVT LT": "CROCHET TRADE AND INVESTMENT PRIVATE LIMITED",
    "CROCHET TRADE AND INVESTMENT PVT L": "CROCHET TRADE AND INVESTMENT PRIVATE LIMITED",
    "FUTURE GAMING AND HOTEL SERVICES PR ": "FUTURE GAMING AND HOTEL SERVICES PRIVATE LIMITED",
    "FUTURE GAMING AND HOTEL SERVICES PVT LTD": "FUTURE GAMING AND HOTEL SERVICES PRIVATE LIMITED",
    "D S ENGINEERING WORKSHOP LLP": "D S ENGINEERING WORKS LLP",
    "DERIVE TRADING AND RESORTS PRIVATE LIMIT": "DERIVE TRADING AND RESORTS PRIVATE LIMITED",
    "DIVYESH POWER PVT LTD": "DIVYESH POWER PRIVATE LIMITED",
    "FUTURE GAMING AND HOTEL SERVICES PR": "FUTURE GAMING AND HOTEL SERVICES PRIVATE LIMITED",
    "DCM SHRIRAM LTD": "DCM SHRIRAM LIMITED",
    "ELENA RENEWABLE ENERGY PVT LTD": "ELENA RENEWABLE ENERGY PRIVATE LIMITED",
    "GENUS POWER INFRASTRUCTURES LTD": "GENUS POWER INFRASTRUCTURES LIMITED",
    "GOODLUCK INDIA LTD": "GOODLUCK INDIA LIMITED",
    "GREENKO RAYALA WIND POWER PVT LTD": "GREENKO RAYALA WIND POWER PRIVATE LIMITED",
    "GREENKO RAYALA WIND POWER PRIVATE LIMITE D": "GREENKO RAYALA WIND POWER PRIVATE LIMITED",
    "HONOUR LAB LTD": "HONOUR LAB LIMITED",
    "INORBIT MALLS INDIA PRIVATE LIMIT": "INORBIT MALLS INDIA PRIVATE LIMITED",
    # The "PRIVAE" key is kept on purpose: presumably that misspelling occurs
    # in the raw data (verify against ec_data.csv).
    "ELENA RENEWABLE ENERGY PRIVAE LIMITED": "ELENA RENEWABLE ENERGY PRIVATE LIMITED",
    "J K CEMENT LTD.": "J K CEMENT LIMITED",
    "JAI SUSPENSION SYSTEMS L L P": "JAI SUSPENSION SYSTEMS LLP",
    "LAXMI INDUSTRIAL BOTTLING PLAN": "LAXMI INDUSTRIAL BOTTLING PLANT",
    "MADHYA PRADESH WASTE MANAGEMENT PR": "MADHYA PRADESH WASTE MANAGEMENT PRIVATE LIMITED",
    "MADHYA PRADESH WASTE MANAGEMENT PRI": "MADHYA PRADESH WASTE MANAGEMENT PRIVATE LIMITED",
    "MEGHA ENGINEERING & INFRASTRUCTURES LIMITED": "MEGHA ENGINEERING AND INFRASTRUCTURES LIMITED",
    "MEGHA ENGINEERING AND INFRASTRUCTURES LI MITED": "MEGHA ENGINEERING AND INFRASTRUCTURES LIMITED",
    "MEGHA ENGINEERING AND INFRASTRUCTURES LTD": "MEGHA ENGINEERING AND INFRASTRUCTURES LIMITED",
    "MICRO LABS LTD": "MICRO LABS LIMITED",
    "MKJ ENTERPRISES LTD": "MKJ ENTERPRISES LIMITED",
    "MKK METAL SECTIONS P LTD": "MKK METAL SECTIONS PRIVATE LIMITED",
    "MKK METAL SECTIONS PVT LTD": "MKK METAL SECTIONS PRIVATE LIMITED",
    "MODERN ROAD MAKERS PVT LTD": "MODERN ROAD MAKERS PRIVATE LIMITED",
    "MODERN ROAD MAKERS PVT. LTD.": "MODERN ROAD MAKERS PRIVATE LIMITED",
    "MOHIT MINERALS LTD": "MOHIT MINERALS LIMITED",
    "MY HOME INFRASTRUCTURES PRIVATE LI": "MY HOME INFRASTRUCTURES PRIVATE LIMITED",
    "MY HOME INFRASTRUCTURES PVT LTD MY": "MY HOME INFRASTRUCTURES PRIVATE LIMITED",
    "MYTRAH ENERGY(INDIA) PRIVATE LIMITED": "MYTRAH ENERGY INDIA PRIVATE LIMITED",
    "NATCO PHARMA LTD": "NATCO PHARMA LIMITED",
    "NAVAYUGA ENGINEERING CO LTD": "NAVAYUGA ENGINEERING COMPANY LIMITED",
    "NSL SEZ(HYDERABAD) PRIVATE LIMITED": "NSL SEZ HYDERABAD PRIVATE LIMITED",
    "NUVOCO VISTAS CORP. LTD": "NUVOCO VISTAS CORPORATION LIMITED",
    "NUVOCO VISTAS CORPORATION LTD": "NUVOCO VISTAS CORPORATION LIMITED",
    "PALM SHELTER ESTATE DEVELOPMENT LL": "PALM SHELTER ESTATE DEVELOPMENT LLP",
    "PENGUIN TRADING & AGENCIES LIMITED": "PENGUIN TRADING AND AGENCIES LIMITED",
    "PENGUIN TRADING AND AGENCIES LTD": "PENGUIN TRADING AND AGENCIES LIMITED",
    "PIRAMAL ENTERPRISES LTD": "PIRAMAL ENTERPRISES LIMITED",
}

In [None]:
# Rewrite every purchaser-name variant listed in `dic` to its canonical form.
# Entries are applied one after another in dict order, so a canonical value
# that is itself a key of a later entry is still normalised by that entry.
renamed_rows = 0
for old_name, canonical_name in dic.items():
  matches = grouped_df[pnstr] == old_name
  # One boolean-mask assignment replaces all matching rows at once.
  grouped_df.loc[matches, pnstr] = canonical_name
  renamed_rows += int(matches.sum())
print(f"Renamed {renamed_rows} rows to canonical purchaser names")

In [None]:
# Re-run the near-duplicate scan on the current purchaser names to list pairs
# that still look like spelling variants of the same entity.
# NOTE: `jaro_similarity` is imported in the jellyfish cell at the bottom of
# the notebook; that cell has to be run before this one.
FIRST_WORD_MIN_SIMILARITY = 0.97  # cheap pre-filter: first words must (nearly) match
FULL_NAME_MIN_SIMILARITY = 0.85   # whole names must be at least this similar

upn = np.unique(grouped_df[pnstr].to_numpy())

# Compare distinct names only (not every grouped row), and pre-split each name
# once; blank names are skipped because they have no first word to compare.
candidates = [(name, name.split()[0]) for name in upn if name.strip()]

for i, (name1, first1) in enumerate(candidates):
  # Only look at names after position i so each pair is reported once.
  for name2, first2 in candidates[i + 1:]:
    if jaro_similarity(first1, first2) < FIRST_WORD_MIN_SIMILARITY:
      continue
    similarity = jaro_similarity(name1, name2)
    if FULL_NAME_MIN_SIMILARITY < similarity < 1:
      print(f"{name1} | {name2} | {similarity}")

In [175]:
# Number of distinct purchaser names found by the last scan.
upn.size

1242

In [181]:
# Group again by (purchaser, date) so rows whose names were rewritten to the
# same value are summed together, then save the result.
final_grouped_df = (
  grouped_df
  .groupby(by=[pnstr, dstr], as_index=False)
  .sum()
)
final_grouped_df.to_csv(path_or_buf='01_all_data.csv')

In [184]:
# Sum the numeric columns per purchaser, order by Denomination (largest
# first), and save the ranking.
total_df = (
  final_grouped_df
  .groupby(by=pnstr)
  .sum(numeric_only=True)
  .sort_values(by=denostr, ascending=False)
)
total_df.to_csv(path_or_buf='00_all_data.csv')

In [14]:
!pip install jellyfish
from jellyfish import jaro_similarity

Collecting jellyfish
  Downloading jellyfish-1.0.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/1.2 MB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.2/1.2 MB[0m [31m5.0 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m1.2/1.2 MB[0m [31m19.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.2/1.2 MB[0m [31m14.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: jellyfish
Successfully installed jellyfish-1.0.3
