In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression 
from sklearn.impute import SimpleImputer
import dtale

In [4]:
train= pd.read_csv('/home/momtahin/Documents/Backpack Prediction Challenge/data/train.csv')
train_extra= pd.read_csv('/home/momtahin/Documents/Backpack Prediction Challenge/data/training_extra.csv')
test= pd.read_csv('/home/momtahin/Documents/Backpack Prediction Challenge/data/test.csv')

In [5]:
# dt = dtale.show(train)
# dte = dtale.show(train_extra)
# dtest = dtale.show(test)
# dt.open_browser()
# dte.open_browser()
# dtest.open_browser()

In [6]:
# Concating train and train_extra 

train = pd.concat([train, train_extra], ignore_index=True)

# concat: https://pandas.pydata.org/docs/reference/api/pandas.concat.html#pandas-concat

In [7]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3994318 entries, 0 to 3994317
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Brand                 object 
 2   Material              object 
 3   Size                  object 
 4   Compartments          float64
 5   Laptop Compartment    object 
 6   Waterproof            object 
 7   Style                 object 
 8   Color                 object 
 9   Weight Capacity (kg)  float64
 10  Price                 float64
dtypes: float64(3), int64(1), object(7)
memory usage: 335.2+ MB


In [8]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200000 entries, 0 to 199999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   id                    200000 non-null  int64  
 1   Brand                 193773 non-null  object 
 2   Material              194387 non-null  object 
 3   Size                  195619 non-null  object 
 4   Compartments          200000 non-null  float64
 5   Laptop Compartment    195038 non-null  object 
 6   Waterproof            195189 non-null  object 
 7   Style                 194847 non-null  object 
 8   Color                 193215 non-null  object 
 9   Weight Capacity (kg)  199923 non-null  float64
dtypes: float64(2), int64(1), object(7)
memory usage: 15.3+ MB


In [9]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test= train_test_split(train.drop(columns='Price'),train['Price'],test_size=0.1)

In [10]:
cat_cols = train.select_dtypes(include='object').columns.to_list()

# select_dtypes: https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.select_dtypes.html#pandas-dataframe-select-dtypes
# t_list(): https://pandas.pydata.org/docs/reference/api/pandas.Series.to_list.html#pandas.Series.to_list

In [11]:
# Define numeric pipeline
num_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='mean'))
])

# Define categorical pipeline
cat_pipeline = Pipeline(steps=[
    ('encoder', OrdinalEncoder()),
    ('imputer', SimpleImputer(missing_values=np.nan, strategy='most_frequent'))
])

# Combining num_pipeline & cat_pipeline
preprocessor = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, ['Compartments','Weight Capacity (kg)']),
        ('cat_pipeline', cat_pipeline, cat_cols)
])


# Final pipeline
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])



# Fit the pipeline on the training set
final_pipeline.fit(X_train, y_train)

# Test set predictions
y_pred = final_pipeline.predict(X_test)



# ColumnTransformer: https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html#columntransformer


In [12]:
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f'Mean Squared Error: {mse}')
print(f'Root Mean Squared Error: {rmse}')

Mean Squared Error: 1515.5642900472717
Root Mean Squared Error: 38.93024903654319


In [13]:
predictions = final_pipeline.predict(test)


submission = pd.DataFrame({
    'id': test['id'],
    'Price': predictions
})

submission.to_csv('/home/momtahin/Documents/Backpack Prediction Challenge/submission.csv', index=False)