In [2]:
import warnings
warnings.filterwarnings("ignore")

import optuna
import xgboost as xgb
import pandas as pd
import matplotlib.pyplot as plt
import scipy.stats as stats
import numpy as np
import seaborn as sns
from sklearn.model_selection import GroupKFold
from sklearn.metrics import accuracy_score, classification_report, mean_absolute_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import LinearSVC
from sklearn.preprocessing import RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.decomposition import PCA
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, accuracy_score, median_absolute_error
from imblearn.over_sampling import RandomOverSampler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import mean_squared_error, r2_score
import lightgbm as lgb
from sklearn.model_selection import KFold
from sklearn.ensemble import GradientBoostingRegressor, StackingRegressor
from sklearn.linear_model import Ridge
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error
import catboost as cb
from scipy.optimize import minimize

In [3]:
# Read the three CSV inputs in one pass. The paths are relative, so they are
# resolved against the kernel's current working directory.
train, test, extra = map(
    pd.read_csv,
    (
        "dataset/train.csv",
        "dataset/test.csv",
        "dataset/training_extra.csv",
    ),
)

In [4]:
# Preview the first five rows of the training frame (five is head()'s default).
train.head(n=5)

Unnamed: 0,id,Brand,Material,Size,Compartments,Laptop Compartment,Waterproof,Style,Color,Weight Capacity (kg),Price
0,0,Jansport,Leather,Medium,7.0,Yes,No,Tote,Black,11.611723,112.15875
1,1,Jansport,Canvas,Small,10.0,Yes,Yes,Messenger,Green,27.078537,68.88056
2,2,Under Armour,Leather,Small,2.0,Yes,No,Messenger,Red,16.64376,39.1732
3,3,Nike,Nylon,Small,8.0,Yes,No,Messenger,Green,12.93722,80.60793
4,4,Adidas,Canvas,Medium,1.0,Yes,Yes,Messenger,Green,17.749338,86.02312


In [5]:
# Record (rows, columns) for each frame under the same module-level names.
(
    (num_train_rows, num_train_columns),
    (num_test_rows, num_test_columns),
    (num_extra_rows, num_extra_columns),
) = (frame.shape for frame in (train, test, extra))

# Emit one three-line section per dataset: a label, then row and column counts.
print(
    "\n".join(
        f"{label}\nNumber of Rows: {n_rows}\nNumber of columns: {n_cols}"
        for label, n_rows, n_cols in (
            ("train_data", num_train_rows, num_train_columns),
            ("test_data", num_test_rows, num_test_columns),
            ("extra_data", num_extra_rows, num_extra_columns),
        )
    )
)

train_data
Number of Rows: 300000
Number of columns: 11
test_data
Number of Rows: 200000
Number of columns: 10
extra_data
Number of Rows: 3694318
Number of columns: 11


In [11]:
# Per-column missing-value summaries: absolute null count and the share of rows
# (in percent) that are null, one small frame per dataset.
missing_values_train = pd.DataFrame(
    {
        "Feature": train.columns,
        "[TRAIN] No. of Missing Values": train.isna().sum().values,
        "[TRAIN] % of missing Values": train.isna().sum().values / len(train) * 100,
    }
)

missing_values_test = pd.DataFrame(
    {
        "Feature": test.columns,
        "[TEST] No.of Missing Values": test.isna().sum().values,
        "[TEST] % of Missing Values": test.isna().sum().values / len(test) * 100,
    }
)

missing_values_extra = pd.DataFrame(
    {
        "Feature": extra.columns,
        "[EXTRA] No.of Missing Values": extra.isna().sum().values,
        "[EXTRA] % of Missing Values": extra.isna().sum().values / len(extra) * 100,
    }
)

# Cardinality and dtype of every training column.
unique_values = pd.DataFrame(
    {
        "Feature": train.columns,
        "No. of Unique Values[FROM TRAIN]": train.nunique().values,
    }
)

feature_types = pd.DataFrame(
    {
        "Feature": train.columns,
        "DataType": train.dtypes,
    }
)

# Left-join everything onto the train summary, keyed by column name. A column
# that exists only in train keeps its row and gets NaN in the other datasets'
# fields.
merged_df = (
    missing_values_train
    .merge(missing_values_test, on="Feature", how="left")
    .merge(missing_values_extra, on="Feature", how="left")
    .merge(unique_values, on="Feature", how="left")
    .merge(feature_types, on="Feature", how="left")
)

merged_df

Unnamed: 0,Feature,[TRAIN] No. of Missing Values,[TRAIN] % of missing Values,[TEST] No.of Missing Values,[TEST] % of Missing Values,[EXTRA] No.of Missing Values,[EXTRA] % of Missing Values,No. of Unique Values[FROM TRAIN],DataType
0,id,0,0.0,0.0,0.0,0,0.0,300000,int64
1,Brand,9705,3.235,6227.0,3.1135,117053,3.16846,5,object
2,Material,8347,2.782333,5613.0,2.8065,102615,2.777644,4,object
3,Size,6595,2.198333,4381.0,2.1905,81190,2.197699,3,object
4,Compartments,0,0.0,0.0,0.0,0,0.0,10,float64
5,Laptop Compartment,7444,2.481333,4962.0,2.481,91089,2.465651,2,object
6,Waterproof,7050,2.35,4811.0,2.4055,87274,2.362385,2,object
7,Style,7970,2.656667,5153.0,2.5765,96210,2.60427,3,object
8,Color,9950,3.316667,6785.0,3.3925,123667,3.347492,6,object
9,Weight Capacity (kg),138,0.046,77.0,0.0385,1670,0.045205,181596,float64


In [13]:
# Count fully duplicated rows in each dataset. DataFrame.duplicated() compares
# all columns and flags every repeat of an earlier row, so summing the boolean
# mask gives the number of redundant rows.
train_duplicates = train.duplicated().sum()

test_duplicates = test.duplicated().sum()

# Each count must come from its own frame: the extra-data figure is computed
# on `extra`, not on `test`.
extra_duplicates = extra.duplicated().sum()

print(f"Number of duplicate rows in train:{train_duplicates}")
print(f"Number of duplicate rows in test:{test_duplicates}")
print(f"Number of duplicate rows in extra:{extra_duplicates}")

Number of duplicate rows in train:0
Number of duplicate rows in test:0
Number of duplicate rows in extra:0


In [15]:
# Summary statistics from describe(), transposed so each feature is one row.
train.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
id,300000.0,149999.5,86602.684716,0.0,74999.75,149999.5,224999.25,299999.0
Compartments,300000.0,5.44359,2.890766,1.0,3.0,5.0,8.0,10.0
Weight Capacity (kg),299862.0,18.029994,6.966914,5.0,12.097867,18.068614,24.002375,30.0
Price,300000.0,81.411107,39.03934,15.0,47.38462,80.95612,115.01816,150.0
