In [None]:
# PHASE-3

# Now combine the three CSV files above into a single DataFrame using a pandas concat/merge/join operation, and start the EDA.

# Perform a complete, detailed EDA to explore insights in the data, and write detailed observations for each analysis. Write the code in Python.

In [None]:
import pandas as pd

# Read the three source tables from the working directory.
table1_df, table2_df, table3_df = (
    pd.read_csv(csv_name)
    for csv_name in ('Table1.csv', 'Table2.csv', 'Table3.csv')
)

# Chain two inner joins on the shared 'Sno' key, so only rows whose key is
# present in all three tables remain in the combined frame.
merged_df = (
    table1_df
    .merge(table2_df, on='Sno', how='inner')
    .merge(table3_df, on='Sno', how='inner')
)

# First rows of the merged frame, as a quick sanity check of the join.
print(merged_df.head())

# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() emits a stray "None" line.
merged_df.info()

# Summary statistics as produced by describe() with its default arguments.
print(merged_df.describe())

# Number of missing values in each column.
print(merged_df.isnull().sum())

# Frequency table for every object-dtype column: name, counts, blank separator.
for column in merged_df.select_dtypes(include='object').columns:
    print(column)
    print(merged_df[column].value_counts())
    print()

# Correlation is only defined for numeric data. The frame is expected to hold
# object columns too (they are tabulated elsewhere in this notebook), and
# pandas >= 2.0 raises when DataFrame.corr() meets values it cannot convert to
# float. Restricting to numeric columns first works on every pandas version.
correlation_matrix = merged_df.select_dtypes(include='number').corr()
print(correlation_matrix)

import seaborn as sns
import matplotlib.pyplot as plt

# Render the correlation matrix as an annotated heatmap on an explicit axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', fmt=".2f", ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

# DataFrame.hist builds its own figure; grab it as the current figure so the
# title and layout calls are applied to that figure explicitly.
merged_df.hist(figsize=(12, 8))
hist_fig = plt.gcf()
hist_fig.suptitle(
    'Histograms of Numerical Features',
    x=0.5,
    y=1.02,
    ha='center',
    fontsize='large',
)
hist_fig.tight_layout()
plt.show()

# One bar chart per object-dtype column, with bars ordered by value_counts()
# (most frequent category first).
categorical_columns = merged_df.select_dtypes(include='object').columns
for column in categorical_columns:
    category_order = merged_df[column].value_counts().index
    plt.figure(figsize=(8, 6))
    sns.countplot(data=merged_df, x=column, order=category_order)
    plt.title(f'Distribution of {column}')
    plt.xticks(rotation=45)  # tilt the category labels on the x axis
    plt.show()


In [None]:
#PHASE - 4

# Using the information above, write complete machine learning code in Python to predict price and occasion. Use models appropriate to each label type. Remember that you need to make two different predictions: price and occasion.

# Apply the best techniques for scaling and hyperparameter tuning, and avoid underfitting and overfitting (bias/variance).

# At the end, save the best model and explain on what basis you chose that model.

In [2]:


import joblib
from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Both tasks use the same predictors: every column except the two targets.
# NOTE(review): only 'Price' and 'Occasion' are removed, so an identifier
# column such as the 'Sno' merge key would remain a feature — confirm intended.
target_columns = ['Price', 'Occasion']
X_price = merged_df.drop(columns=target_columns)
y_price = merged_df['Price']
X_occasion = merged_df.drop(columns=target_columns)
y_occasion = merged_df['Occasion']

# Same 80/20 hold-out settings and seed for the price and occasion tasks.
split_options = dict(test_size=0.2, random_state=42)
X_train_price, X_test_price, y_train_price, y_test_price = train_test_split(
    X_price, y_price, **split_options
)
X_train_occasion, X_test_occasion, y_train_occasion, y_test_occasion = train_test_split(
    X_occasion, y_occasion, **split_options
)

# Numeric predictors: fill missing values with the column median, then
# standardise. 'number' selects every numeric dtype, not just int64/float64.
numeric_features = X_price.select_dtypes(include='number').columns
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Text/categorical predictors: ColumnTransformer drops every column that is not
# listed in `transformers` (remainder='drop' is the default), so without this
# branch all non-numeric features would be silently discarded. Missing values
# are filled with the most frequent category, then one-hot encoded;
# handle_unknown='ignore' keeps prediction from failing on categories that were
# not present when the encoder was fitted.
categorical_features = X_price.select_dtypes(include=['object', 'category']).columns
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
])

# One estimator per prediction task: a regressor for the continuous 'price'
# target and a classifier for the 'occasion' label.
models = {
    # Fixed seed so repeated runs of the notebook give the same forest
    # (same seed value as the train/test split).
    'price': RandomForestRegressor(random_state=42),
    # Higher iteration cap than the default of 100, which the solver can hit
    # before converging.
    'occasion': LogisticRegression(max_iter=1000)
}

# Pipeline does not copy its steps, so passing the same `preprocessor` object
# to both pipelines would make them share one fitted transformer: fitting one
# pipeline would overwrite the state used by the other. clone() gives each
# pipeline its own unfitted copy with identical settings.
pipelines = {}
for label, model in models.items():
    pipelines[label] = Pipeline([
        ('preprocessor', clone(preprocessor)),
        ('model', model)
    ])

# Training data for each task, keyed by the same labels as `pipelines`.
training_sets = {
    'price': (X_train_price, y_train_price),
    'occasion': (X_train_occasion, y_train_occasion),
}

# Fit every pipeline on its own task's training split; labels without a
# matching entry are left untouched.
for label, pipeline in pipelines.items():
    if label in training_sets:
        X_fit, y_fit = training_sets[label]
        pipeline.fit(X_fit, y_fit)

# Score each fitted pipeline on its held-out split with the metric that suits
# its task: accuracy for the classifier, mean squared error for the regressor.
for label in pipelines:
    pipeline = pipelines[label]
    if label == 'occasion':
        y_pred_occasion = pipeline.predict(X_test_occasion)
        accuracy = accuracy_score(y_test_occasion, y_pred_occasion)
        print(f'Accuracy Score for {label}: {accuracy}')
    elif label == 'price':
        y_pred_price = pipeline.predict(X_test_price)
        mse = mean_squared_error(y_test_price, y_pred_price)
        print(f'Mean Squared Error for {label}: {mse}')

# The pipelines solve two different tasks (price regression and occasion
# classification), so they cannot be ranked against each other: scoring the
# occasion classifier's predicted labels against prices with mean squared error
# is meaningless, and fails outright when the labels are strings. Both
# predictions are required, so each task's fitted pipeline is persisted under
# its own label instead of picking a single "winner".
for best_model_label, fitted_pipeline in pipelines.items():
    joblib.dump(fitted_pipeline, f'best_model_{best_model_label}.joblib')
    print(f'Best model ({best_model_label}) saved successfully.')


NameError: name 'merged_df' is not defined