In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OrdinalEncoder

In [2]:
# Load the competition files. For train/test the first CSV column is used as
# the DataFrame index; the sample submission is read with a default index.
train_data, test_data = (
    pd.read_csv(csv_path, index_col=0) for csv_path in ('train.csv', 'test.csv')
)
sample_submission_data = pd.read_csv('sample_submission.csv')

In [3]:
# Working copies of the loaded frames. Wrapping an existing DataFrame in
# pd.DataFrame(...) does not take a deep copy by default, so the in-place
# fills and encodings applied to train_df / test_df later could alias the
# raw frames. An explicit copy keeps the raw data untouched.
train_df = train_data.copy()
test_df = test_data.copy()
sample_submission_data_df = sample_submission_data.copy()

In [4]:
# Print the column dtypes and non-null counts of the train and test frames.
for frame in (train_df, test_df):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
Index: 300000 entries, 0 to 299999
Data columns (total 10 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0   Brand                 290295 non-null  object 
 1   Material              291653 non-null  object 
 2   Size                  293405 non-null  object 
 3   Compartments          300000 non-null  float64
 4   Laptop Compartment    292556 non-null  object 
 5   Waterproof            292950 non-null  object 
 6   Style                 292030 non-null  object 
 7   Color                 290050 non-null  object 
 8   Weight Capacity (kg)  299862 non-null  float64
 9   Price                 300000 non-null  float64
dtypes: float64(3), object(7)
memory usage: 25.2+ MB
<class 'pandas.core.frame.DataFrame'>
Index: 200000 entries, 300000 to 499999
Data columns (total 9 columns):
 #   Column                Non-Null Count   Dtype  
---  ------                --------------   -----  
 0 

In [5]:
# Split the training columns by dtype: float/int columns are treated as
# numerical, object columns as categorical. The target 'Price' is still part
# of `numerical` at this point.
numeric_frame = train_df.select_dtypes(include=['float64', 'int32', 'int64'])
object_frame = train_df.select_dtypes(include=['object'])
numerical = list(numeric_frame.columns)
categorical = list(object_frame.columns)

In [6]:
# Display the numerical column names detected from the training frame.
numerical

['Compartments', 'Weight Capacity (kg)', 'Price']

In [7]:
# Restrict `numerical` to the columns that are mean-imputed later: 'Price' is
# the prediction target, and 'Compartments' is excluded as well (it shows no
# missing values in train_df.info()). Filtering instead of list.remove() keeps
# this cell safe to re-run, because remove() raises ValueError once the name
# is already gone.
numerical = [col for col in numerical if col not in ('Price', 'Compartments')]

In [8]:
# Display the numerical columns left after removing 'Price' and 'Compartments'.
numerical

['Weight Capacity (kg)']

In [9]:
# Display the categorical (object-dtype) column names.
categorical

['Brand',
 'Material',
 'Size',
 'Laptop Compartment',
 'Waterproof',
 'Style',
 'Color']

In [10]:
# Fill missing categorical values with the most frequent training value.
# The mode is taken from train_df only and reused for test_df, so both frames
# are imputed with the same value.
for col in categorical:
    most_frequent = train_df[col].mode().iloc[0]
    for frame in (train_df, test_df):
        frame[col] = frame[col].fillna(most_frequent)

In [11]:
# Fill missing numerical values with the training mean of each column.
# Means are computed from train_df only and applied to both frames.
train_means = {col: train_df[col].mean() for col in numerical}
for frame in (train_df, test_df):
    for col, col_mean in train_means.items():
        frame[col] = frame[col].fillna(col_mean)

In [12]:
# Encode 'Size' on an ordered scale: the explicit category list fixes the
# order Small < Medium < Large, so the labels map to 0, 1 and 2.
# (OrdinalEncoder is imported in the notebook's import cell.)
size_encoder = OrdinalEncoder(categories=[['Small', 'Medium', 'Large']])

# Fit the encoder on the training data and replace the labels with codes.
# NOTE: 'Size' is overwritten in place, so this cell is meant to run once per
# kernel session; afterwards the column no longer holds the original labels.
train_df['Size'] = size_encoder.fit_transform(train_df[['Size']])

# Reuse the fitted encoder so test_df gets the same label-to-code mapping.
test_df['Size'] = size_encoder.transform(test_df[['Size']])

In [13]:
# One-hot encode every categorical feature except 'Size' (handled separately
# as an ordinal), restricted to columns present in both frames.
shared_columns = set(train_df.columns) & set(test_df.columns)
categorical_columns = [
    col for col in categorical
    if col != 'Size' and col in shared_columns
]

train_df = pd.get_dummies(train_df, columns=categorical_columns)
test_df = pd.get_dummies(test_df, columns=categorical_columns)

# Give test_df exactly train_df's columns, in train_df's order: columns that
# are missing from test are added filled with 0, and test-only columns are
# dropped. Because train_df still contains 'Price', test_df also gains a
# zero-filled 'Price' column, which has to be removed before predicting.
train_df, test_df = train_df.align(test_df, join='left', axis=1, fill_value=0)

In [14]:
# Separate the target from the training features. X_test is the aligned test
# frame as-is (same object as test_df).
y_train = train_df['Price']
X_train = train_df.drop(columns=['Price'])
X_test = test_df

In [15]:
# Fit an ordinary least-squares linear regression with default settings on
# the training features and target.
model = LinearRegression()
model.fit(X=X_train, y=y_train)

In [16]:
# X_test may still carry the zero-filled 'Price' column introduced when the
# test frame was aligned to the training columns. errors='ignore' makes the
# drop a no-op when the column is already gone, so this cell can be re-run
# without raising KeyError.
X_test = X_test.drop(columns=['Price'], errors='ignore')

# Predict prices for the test rows with the fitted linear model.
predictions = model.predict(X_test)

# Build the submission frame: the test frame's index supplies the ids and the
# predictions are rounded to 3 decimal places.
submission = pd.DataFrame({'id': test_df.index, 'Price': predictions.round(3)})

# Write the submission to disk without the DataFrame's own index column.
submission.to_csv('LR02.csv', index=False)