In [17]:
# Import necessary libraries
# import PyPDF2
# import re
# import nltk
# from nltk.tokenize import word_tokenize
# from nltk.tag import pos_tag
# from nltk.chunk import ne_chunk
# import fitz  # PyMuPDF
# Dashboarding
# import dash
# from dash import dcc
# from dash import html
# from dash.dependencies import Input, Output

# PDF Extractor
import tabula
# Data Manipulation
import pandas as pd
import numpy as np


In [18]:
# Ensure necessary NLTK models are downloaded
#nltk.download('punkt')
#nltk.download('maxent_ne_chunker')
#nltk.download('words')

In [21]:
def extract_and_combine_tables(pdf_path, page_numbers, new_header):
    combined_table = pd.DataFrame()
    
    for page in page_numbers:
        # Extract table from each page
        tables = tabula.read_pdf(pdf_path, pages=page, multiple_tables=True, lattice=True)
        
        if tables:
            df = tables[0]
            # Delete the first three rows from each page
            df = df.iloc[3:]
            combined_table = pd.concat([combined_table, df], ignore_index=True)
    # Keep only the first 5 columns
    combined_table = combined_table.iloc[:, :5]  

    # Set the new header for the combined table
    combined_table.columns = new_header
    
    return combined_table

# Specify your PDF path
pdf_path = '/Users/mohjaiswal/Desktop/ArthyaInvest/dvdfdf.pdf'
pages = [18, 19]  # Pages to extract
new_header = [
    "Profit and Loss", 
    "As at September 30, 2020", 
    "As at March 31, 2020", 
    "As at March 31, 2019", 
    "As at March 31, 2018"
]

# Call the function and assign the result to a variable
df = extract_and_combine_tables(pdf_path, pages, new_header)

# Display the DataFrame
df

Unnamed: 0,Profit and Loss,"As at September 30, 2020","As at March 31, 2020","As at March 31, 2019","As at March 31, 2018"
0,Income,,,,
1,Revenue from operations,2004.57,2475.09,1696.98,1720.40
2,Other income,65.49,146.37,164.00,99.00
3,Total income,2070.06,2621.46,1860.98,1819.40
4,,,,,
5,Expenses,,,,
6,"Content, event and web server",151.97,506.92,442.95,121.35
7,Advertising and promotion,1196.24,1329.26,272.48,427.80
8,Commission,222.66,70.72,31.31,32.26
9,Employee benefits,218.54,318.67,413.84,371.97


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 55 entries, 0 to 54
Data columns (total 5 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   Profit and Loss           45 non-null     object
 1   As at September 30, 2020  35 non-null     object
 2   As at March 31, 2020      35 non-null     object
 3   As at March 31, 2019      35 non-null     object
 4   As at March 31, 2018      35 non-null     object
dtypes: object(5)
memory usage: 2.3+ KB


In [5]:
# Copy the DataFrame
indexed_df = df.copy()

# Set 'Balance Sheet' as the new index for the copied DataFrame
indexed_df.set_index('Profit and Loss', inplace=True)

# Now, indexed_df will have 'Balance Sheet' as its index
# Display the indexed DataFrame to verify the changes
# Set the maximum number of rows to display all the important data points
pd.set_option('display.max_rows', 100)

# Now when you display the DataFrame, all 100 rows will be shown
indexed_df

Unnamed: 0_level_0,"As at September 30, 2020","As at March 31, 2020","As at March 31, 2019","As at March 31, 2018"
Profit and Loss,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Income,,,,
Revenue from operations,2004.57,2475.09,1696.98,1720.40
Other income,65.49,146.37,164.00,99.00
Total income,2070.06,2621.46,1860.98,1819.40
,,,,
Expenses,,,,
"Content, event and web server",151.97,506.92,442.95,121.35
Advertising and promotion,1196.24,1329.26,272.48,427.80
Commission,222.66,70.72,31.31,32.26
Employee benefits,218.54,318.67,413.84,371.97


In [6]:
# Remove backticks from the index
indexed_df.index = indexed_df.index.astype(str).str.replace('`', '')

# Check the result
print(indexed_df.index)

Index(['Income', 'Revenue from operations', 'Other income', 'Total income',
       'nan', 'Expenses', 'Content, event and web server',
       'Advertising and promotion', 'Commission', 'Employee benefits',
       'Finance costs', 'Depreciation and amortization', 'Others',
       'Total expenses', 'nan',
       'Profit/(Loss) before exceptional items, share of net\rprofit/(losses) of investments accounted for using the\requity method and tax',
       'Share of loss of Investments accounted using equity\rmethod (net)',
       '(Loss)/profit before exceptional items and tax', 'Exceptional items',
       '(Loss)/profit before tax', 'Tax expense:', 'Current tax',
       'Deferred tax', 'Total tax expense', 'nan', 'Net (loss)/profit', 'nan',
       'Other comprehensive income', 'nan',
       'Items that will not be reclassified to profit and loss:',
       'Re-measurements of defined employee benefit plans',
       'Income tax relating to items that will not be reclassified to\rprofit or los

In [7]:
indexed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55 entries, Income to Diluted
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   As at September 30, 2020  35 non-null     object
 1   As at March 31, 2020      35 non-null     object
 2   As at March 31, 2019      35 non-null     object
 3   As at March 31, 2018      35 non-null     object
dtypes: object(4)
memory usage: 2.1+ KB


In [8]:
# Define a function to clean and convert the values to float
def clean_convert_value(x):
    if isinstance(x, str):
        if x == '-':
            # Convert '-' to NaN
            return np.nan
        x = x.replace('₹', '').replace(',', '')
        if '(' in x and ')' in x:
            return -float(x.replace('(', '').replace(')', ''))
        else:
            return float(x)
    elif pd.isna(x):
        return x
    else:
        return float(x)

# Convert columns 0 to 3 to float
for column in indexed_df.columns[0:4]:  # Selecting columns 0 to 3
    indexed_df[column] = indexed_df[column].apply(clean_convert_value)

# Check the data types and DataFrame
print(indexed_df.dtypes)
indexed_df

As at September 30, 2020    float64
As at March 31, 2020        float64
As at March 31, 2019        float64
As at March 31, 2018        float64
dtype: object


Unnamed: 0_level_0,"As at September 30, 2020","As at March 31, 2020","As at March 31, 2019","As at March 31, 2018"
Profit and Loss,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Income,,,,
Revenue from operations,2004.57,2475.09,1696.98,1720.4
Other income,65.49,146.37,164.0,99.0
Total income,2070.06,2621.46,1860.98,1819.4
,,,,
Expenses,,,,
"Content, event and web server",151.97,506.92,442.95,121.35
Advertising and promotion,1196.24,1329.26,272.48,427.8
Commission,222.66,70.72,31.31,32.26
Employee benefits,218.54,318.67,413.84,371.97


In [9]:
indexed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 55 entries, Income to Diluted
Data columns (total 4 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   As at September 30, 2020  32 non-null     float64
 1   As at March 31, 2020      32 non-null     float64
 2   As at March 31, 2019      32 non-null     float64
 3   As at March 31, 2018      33 non-null     float64
dtypes: float64(4)
memory usage: 2.1+ KB


In [10]:
# Define the number of years for CAGR calculation
years = 2  # Adjust this based on your specific data

# Your existing code for calculations
def calculate_change(current, previous):
    if pd.notna(current) and pd.notna(previous) and previous != 0:
        return (current - previous) / previous * 100
    else:
        return 0

indexed_df['YoY 2019-2020'] = indexed_df.apply(lambda row: calculate_change(row[1], row[2]), axis=1)
indexed_df['YoY 2018-2019'] = indexed_df.apply(lambda row: calculate_change(row[2], row[3]), axis=1)
indexed_df['CAGR'] = indexed_df.apply(lambda row: ((row[1] / row[3]) ** (1 / years) - 1) * 100 if pd.notna(row[1]) and pd.notna(row[3]) and row[3] != 0 else 0, axis=1)

# Display the updated DataFrame
indexed_df

  del sys.path[0]


Unnamed: 0_level_0,"As at September 30, 2020","As at March 31, 2020","As at March 31, 2019","As at March 31, 2018",YoY 2019-2020,YoY 2018-2019,CAGR
Profit and Loss,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Income,,,,,0.0,0.0,0.0
Revenue from operations,2004.57,2475.09,1696.98,1720.4,45.852632,-1.361311,19.944622
Other income,65.49,146.37,164.0,99.0,-10.75,65.656566,21.592962
Total income,2070.06,2621.46,1860.98,1819.4,40.864491,2.285369,20.034897
,,,,,0.0,0.0,0.0
Expenses,,,,,0.0,0.0,0.0
"Content, event and web server",151.97,506.92,442.95,121.35,14.441811,265.018541,104.385378
Advertising and promotion,1196.24,1329.26,272.48,427.8,387.837639,-36.306685,76.272506
Commission,222.66,70.72,31.31,32.26,125.870329,-2.944823,48.060409
Employee benefits,218.54,318.67,413.84,371.97,-22.99681,11.256284,-7.441431


In [13]:
# Assuming indexed_df is your DataFrame

# Temporarily reset the index for styling
temp_df = indexed_df.reset_index()

# Convert string 'nan' to actual NaN values in 'Profit and Loss' column
temp_df['Profit and Loss'] = temp_df['Profit and Loss'].replace('nan', np.nan)

# Drop rows where 'Profit and Loss' column contains NaN
temp_df = temp_df.dropna(subset=['Profit and Loss'])

# Define the styling function to apply bold only to rows where the index contains 'Total'
def bold_if_total_in_index(row):
    if 'Total' in row['Profit and Loss']:
        return ['font-weight: bold'] * len(row)
    else:
        return [''] * len(row)

# Apply the styling
styled_df = temp_df.style.apply(bold_if_total_in_index, axis=1)

# Assume 'numeric_columns' is a list of the names of your numeric columns
numeric_columns = ['As at September 30, 2020', 'As at March 31, 2020', 'As at March 31, 2019', 'As at March 31, 2018', 'YoY 2019-2020', 'YoY 2018-2019', 'CAGR']

# Apply number formatting only to numeric columns
styled_df = styled_df.format({col: "{:.2f}" for col in numeric_columns})

# Display the styled DataFrame
styled_df

Unnamed: 0,Profit and Loss,"As at September 30, 2020","As at March 31, 2020","As at March 31, 2019","As at March 31, 2018",YoY 2019-2020,YoY 2018-2019,CAGR
0,Income,,,,,0.0,0.0,0.0
1,Revenue from operations,2004.57,2475.09,1696.98,1720.4,45.85,-1.36,19.94
2,Other income,65.49,146.37,164.0,99.0,-10.75,65.66,21.59
3,Total income,2070.06,2621.46,1860.98,1819.4,40.86,2.29,20.03
5,Expenses,,,,,0.0,0.0,0.0
6,"Content, event and web server",151.97,506.92,442.95,121.35,14.44,265.02,104.39
7,Advertising and promotion,1196.24,1329.26,272.48,427.8,387.84,-36.31,76.27
8,Commission,222.66,70.72,31.31,32.26,125.87,-2.94,48.06
9,Employee benefits,218.54,318.67,413.84,371.97,-23.0,11.26,-7.44
10,Finance costs,5.11,12.37,13.84,18.34,-10.62,-24.54,-17.87


In [None]:
# Define the path where you want to save the Excel file
file_path = '/Users/mohjaiswal/Desktop/ArthyaInvest/PNL_financial_statement1.xlsx'  # Update this to your desired path

# Save the styled DataFrame to Excel at the specified path
styled_df.to_excel(file_path, engine='openpyxl', index=False)