Task: Data Analysis Basics

Today we learned the principles of data analysis:

Reading data from different file formats (CSV, XLSX, JSON, DB).

Checking data types and converting them if needed.

Detecting missing (empty) values and processing them properly.

Your Task

1. File 1: file_handler.py

Create a Python file named file_handler.py.

It should contain functions to:

Read files with extensions: CSV, XLSX, JSON, DB.

Save (export) data to the same formats: CSV, XLSX, JSON, DB.

2. File 2: preprocessing.py

Create another Python file named preprocessing.py.

It should contain functions for data preprocessing, such as:

Checking data types of columns.

Converting data types when necessary.

Checking empty values.

Processing missing values (for example: filling them with mean/median/mode, or dropping them).

📝 Notes

Use the pandas library for data handling.

Make sure your code is modular: each function should do one specific job.
Keep both files well-structured and easy to reuse in future projects.

In [4]:
import pandas as pd
import numpy as np
import psycopg2
import openpyxl
import sqlalchemy
import warnings
warnings.filterwarnings('ignore')

### file_handler.py

In [57]:
import pandas as pd
import psycopg2
import openpyxl
from sqlalchemy import create_engine


def read_csv(file_path):
    """Load a CSV file into a DataFrame using pandas' default parsing options."""
    return pd.read_csv(file_path)
def read_excel(file_path, sheet_name=0):
    """Load one worksheet of an Excel workbook into a DataFrame.

    ``sheet_name`` is forwarded unchanged to ``pandas.read_excel``; the
    default ``0`` selects the first sheet.
    """
    return pd.read_excel(file_path, sheet_name=sheet_name)

def read_json(file_path):
    """Load a JSON document into a DataFrame using pandas' default options."""
    return pd.read_json(file_path)




def read_db(db_config: dict, table_name: str) -> pd.DataFrame:
    """
    Read an entire PostgreSQL table into a DataFrame.

    db_config should be a dict of psycopg2 connection keywords:
    {
        "dbname": "your_db",
        "user": "your_user",
        "password": "your_password",
        "host": "localhost",
        "port": 5432
    }

    table_name is interpolated directly into the SQL text, so it must come
    from trusted code and never from user input.

    The connection is closed even when the query raises.
    """
    conn = psycopg2.connect(**db_config)
    try:
        query = f"SELECT * FROM {table_name};"
        return pd.read_sql(query, conn)
    finally:
        # Always release the connection, including on a failed query.
        conn.close()

# Save functions

def save_csv(df: pd.DataFrame, file_path: str):
    """Save DataFrame to CSV.

    file_path may be given with or without the ".csv" extension; the
    extension is appended only when it is missing, so "out" and "out.csv"
    both write "out.csv". The index is not written.
    """
    target = str(file_path)
    # Avoid producing "name.csv.csv" when the caller already supplied the suffix.
    if not target.lower().endswith(".csv"):
        target += ".csv"
    df.to_csv(target, index=False)
    print("csv saved!")


def save_excel(df: pd.DataFrame, file_path: str, sheet_name: str = "Sheet1"):
    """Save DataFrame to Excel.

    file_path may be given with or without the ".xlsx" extension; the
    extension is appended only when it is missing. The data is written to
    the worksheet named by sheet_name, without the index.
    """
    target = str(file_path)
    # Avoid producing "name.xlsx.xlsx" when the caller already supplied the suffix.
    if not target.lower().endswith(".xlsx"):
        target += ".xlsx"
    df.to_excel(target, index=False, sheet_name=sheet_name)
    print("excel saved!")


def save_json(df: pd.DataFrame, file_path: str):
    """Save DataFrame to JSON as a list of row records.

    file_path may be given with or without the ".json" extension; the
    extension is appended only when it is missing.
    """
    target = str(file_path)
    # Avoid producing "name.json.json" when the caller already supplied the suffix.
    if not target.lower().endswith(".json"):
        target += ".json"
    df.to_json(target, orient="records")
    print("json saved!")

# Optional: pass indent=4 to df.to_json above for pretty-printed output.

def save_db(df: pd.DataFrame, db_config: dict, table_name: str, if_exists: str = "replace"):
    """
    Save DataFrame to PostgreSQL table.

    db_config should be a dict:
    {
        "dbname": "your_db",
        "user": "your_user",
        "password": "your_password",
        "host": "localhost",
        "port": 5432
    }

    Pass the raw (unencoded) user and password; they are percent-encoded
    here so characters such as "@", ":" or "/" do not corrupt the URL.

    if_exists is forwarded to DataFrame.to_sql ("fail", "replace" or
    "append"); the default "replace" drops and recreates the table.

    The engine is disposed even when the write fails. The index is not
    written.
    """
    from urllib.parse import quote

    # safe="" also encodes "/", which would otherwise end the authority part.
    user = quote(str(db_config["user"]), safe="")
    password = quote(str(db_config["password"]), safe="")

    # Use SQLAlchemy for easier DataFrame export
    engine = create_engine(
        f"postgresql+psycopg2://{user}:{password}@"
        f"{db_config['host']}:{db_config['port']}/{db_config['dbname']}"
    )
    try:
        df.to_sql(table_name, engine, if_exists=if_exists, index=False)
    finally:
        # Release pooled connections whether or not the write succeeded.
        engine.dispose()
    print("db saved!")


### Preprocessing file

In [None]:
# File 2
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

def chk_dtype(df):
    """Summarise every column's dtype and distinct-value count.

    Returns a two-row DataFrame ("Dtypes", "Num_uniques") with one column
    per column of ``df``.
    """
    summary = pd.DataFrame(
        {
            "Dtypes": df.dtypes,
            "Num_uniques": df.nunique(),
        }
    )
    return summary.transpose()



def change2cat(df, threshold=10):
    """Convert low-cardinality columns of ``df`` to the "category" dtype.

    A column is converted when its number of distinct non-null values is
    strictly below ``threshold`` (default 10). ``df`` is modified in place.

    Returns a two-row DataFrame ("Dtypes", "Num_uniques") describing the
    columns after conversion.
    """
    # Casting to category does not change the distinct-value counts, so one
    # pass serves both the decision and the returned summary.
    unique_counts = df.nunique()
    for col in df.columns:
        if unique_counts[col] < threshold:
            df[col] = df[col].astype("category")
    return pd.DataFrame({"Dtypes": df.dtypes, "Num_uniques": unique_counts}).T


#### Check_nulls
def chk_nulls(df):
    """Report missing values per column.

    Returns a two-row DataFrame: "Null_sum" holds the count of nulls and
    "Ratio %" the share of rows that are null, rounded to two decimals.
    """
    missing = df.isna().sum()
    pct = (missing / len(df) * 100).round(2)
    report = pd.DataFrame({"Null_sum": missing, "Ratio %": pct})
    return report.transpose()




#### Change_nulls
def change_nulls(df, drop_threshold=50):
    """Handle missing values in ``df`` in place.

    For each column that contains nulls:
    - if more than ``drop_threshold`` percent of its rows are null
      (default 50), the column is dropped;
    - otherwise its nulls are filled with the column's mode (the first one
      when several values tie).

    A message is printed for every column that is changed. Returns the same
    two-row null report ("Null_sum", "Ratio %") computed after processing.
    """
    null_pct = round((df.isnull().sum() / df.shape[0]) * 100, 2)
    # Iterate over a snapshot because columns may be dropped along the way.
    for col in list(df.columns):
        pct = null_pct[col]
        if not pct > 0:
            continue
        if pct > drop_threshold:
            df.drop(col, axis=1, inplace=True)
            print(f"dropped column: {col}")
            continue
        modes = df[col].mode()
        if modes.empty:
            # Only reachable for an all-null column kept by a high threshold.
            print(f"skipped {col}: no non-null values to compute a mode")
            continue
        # Assign back instead of a chained inplace fillna, which does not
        # update the parent frame under pandas Copy-on-Write.
        df[col] = df[col].fillna(modes.iloc[0])
        print(f"filled {col} nulls with mode")
    null = df.isnull().sum()
    ratio = round((null / df.shape[0]) * 100, 2)

    return pd.DataFrame({"Null_sum": null, "Ratio %": ratio}).T

In [None]:
# Connection settings for a local PostgreSQL database.
# NOTE(review): the password is hard-coded; consider reading it from the environment.
db_config = dict(
    host="localhost",
    dbname="ds_new",
    user="postgres",
    password="456456",
    port="5432",
)
# Round-trip check: read the "students" table, then write it to "student_test".
df = read_db(db_config, "students")
df
save_db(df, db_config, "student_test")

In [None]:
# Load the sample CSV through read_csv and display the resulting DataFrame.
path = "G:/DEPI/Depi_Amit_AI_BNS3/Tasks/DataBase/Data_processing_workshop/train_test.csv"
df = read_csv(path)
df

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3.0,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1.0,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3.0,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3.0,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
...,...,...,...,...,...,...,...,...,...,...,...,...
886,887,0,2.0,"Montvila, Rev. Juozas",male,27.0,0,0,211536,13.0000,,S
887,888,1,1.0,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0000,B42,S
888,889,0,3.0,"Johnston, Miss. Catherine Helen ""Carrie""",female,,1,2,W./C. 6607,23.4500,,S
889,890,1,1.0,"Behr, Mr. Karl Howell",male,26.0,0,0,111369,30.0000,C148,C


In [50]:
# Show each column's dtype and number of distinct values.
chk_dtype(df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Dtypes,int64,int64,float64,object,object,float64,int64,int64,object,float64,object,object
Num_uniques,891,2,3,891,2,88,7,7,681,248,147,3


In [55]:
# Cast low-cardinality columns to category (mutates df) and show the updated summary.
change2cat(df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
Dtypes,int64,category,category,object,category,float64,category,category,object,float64,category
Num_uniques,891,2,3,891,2,88,7,7,681,248,3


In [52]:
# Show the null count and null percentage for every column.
chk_nulls(df)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
Null_sum,0.0,0.0,8.0,0.0,0.0,177.0,0.0,0.0,0.0,0.0,687.0,2.0
Ratio %,0.0,0.0,0.9,0.0,0.0,19.87,0.0,0.0,0.0,0.0,77.1,0.22


In [53]:
# Drop mostly-null columns and fill the remaining nulls with the column mode (mutates df).
change_nulls(df)

filled Pclass nulls with mode
filled Age nulls with mode
dropped column: Cabin
filled Embarked nulls with mode


Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
Null_sum,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Ratio %,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# Export the current DataFrame to CSV in the test folder.
save_csv(df,"G:/DEPI/Depi_Amit_AI_BNS3/Tasks/DataBase/Data_processing_workshop/test_folder/test.csv")

csv saved!
