In [1]:
# Importing the required libraries
import pandas as pd
import numpy as np

In [2]:
# Read the training data from the local course folder
train_csv_path = 'D:\\Sem 5\\ML\\train.csv'
df = pd.read_csv(train_csv_path)

In [3]:
# Remove features considered less important for this model
unused_columns = ['Year_of_Release', 'Developer', 'Publisher', 'Platform']
df = df.drop(columns=unused_columns)

In [4]:
# Summarise each column's dtype and non-null count to see which columns have missing values
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13375 entries, 0 to 13374
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   id            13375 non-null  int64  
 1   index         13375 non-null  int64  
 2   Name          13373 non-null  object 
 3   Genre         13373 non-null  object 
 4   NA_Sales      13375 non-null  float64
 5   EU_Sales      13375 non-null  float64
 6   JP_Sales      13375 non-null  float64
 7   Other_Sales   13375 non-null  float64
 8   Global_Sales  13375 non-null  float64
 9   Critic_Score  6505 non-null   float64
 10  Critic_Count  6505 non-null   float64
 11  User_Score    8007 non-null   object 
 12  User_Count    6039 non-null   float64
 13  Rating        7944 non-null   object 
dtypes: float64(8), int64(2), object(4)
memory usage: 1.4+ MB


In [5]:
# Preview the first row to check the column layout after the feature drop
df.head(1)

Unnamed: 0,id,index,Name,Genre,NA_Sales,EU_Sales,JP_Sales,Other_Sales,Global_Sales,Critic_Score,Critic_Count,User_Score,User_Count,Rating
0,10717,5596,Darkwatch,Shooter,0.16,0.12,0.0,0.04,0.32,74.0,37.0,8.4,42.0,M


In [6]:
# Remove the rows that have no Name
nameless_rows = df.index[df["Name"].isna()]
df = df.drop(index=nameless_rows)

In [7]:
# Remove the rows that have no Genre
genreless_rows = df.index[df["Genre"].isna()]
df = df.drop(index=genreless_rows)

In [8]:
# Rows whose User_Score is the placeholder string "tbd" cannot be cast to float,
# so remove them before converting the column
tbd_rows = df.index[df["User_Score"] == "tbd"]
df = df.drop(index=tbd_rows)
df["User_Score"] = df["User_Score"].astype(np.float64)

In [9]:
# Fill the gaps in the numeric review columns with each column's own mean
for column in ("Critic_Score", "Critic_Count", "User_Count", "User_Score"):
    df[column] = df[column].fillna(df[column].mean())

# Rating holds labels, so fill its gaps with the most frequent value instead
most_common_rating = df["Rating"].mode()[0]
df["Rating"] = df["Rating"].fillna(most_common_rating)

In [10]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

# Learn the Rating categories first, then replace every label with its ordinal code
rating_frame = df[["Rating"]]
ordinal_encoder = OrdinalEncoder().fit(rating_frame)
df["Rating"] = ordinal_encoder.transform(rating_frame)

In [11]:
from sklearn.model_selection import train_test_split

# Global_Sales is the target; every remaining column except 'index' stays as a candidate feature
Y = df['Global_Sales']
X = df.drop(columns=['index', 'Global_Sales'])

# Hold out 30% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=0)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(7983, 12) (3422, 12) (7983,) (3422,)


In [12]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

# Numeric branch: every float64 column, median-imputed and then standardised
num_cols = [name for name, dtype in X_train.dtypes.items() if dtype == 'float64']
num_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

# Categorical branch: object columns with at most 30 distinct values,
# filled with the most frequent value and then one-hot encoded
cat_cols = [
    name for name, dtype in X_train.dtypes.items()
    if dtype == 'object' and X_train[name].nunique() <= 30
]
cat_transform = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Only the columns listed in num_cols / cat_cols are passed to the two branches
preprocessor = ColumnTransformer(transformers=[
    ('num', num_transform, num_cols),
    ('cat', cat_transform, cat_cols),
])
X_train_prep = preprocessor.fit_transform(X_train)

In [13]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
# Fit a linear regression on the preprocessed training features.
# The fit call is the cell's last expression, so the fitted estimator is displayed as output.
model = LinearRegression()
model.fit(X_train_prep, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [14]:
# Apply the already-fitted preprocessor to the hold-out split (transform only, no refit)
X_test_prep = preprocessor.transform(X_test)
# Predicting test set results
y_pred = model.predict(X_test_prep)

In [15]:
import math
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Score the hold-out predictions with MAE, R^2 and RMSE.
# The R^2 value is stored as lin_r2 so that the imported r2_score function is not
# overwritten by its own result and stays callable in later cells.
lin_mae = mean_absolute_error(y_test, y_pred)
lin_r2 = r2_score(y_test, y_pred)
rmse = math.sqrt(mean_squared_error(y_test, y_pred))

# NOTE(review): the feature matrix still contains NA_Sales, EU_Sales, JP_Sales and
# Other_Sales; if Global_Sales is their sum, a near-perfect score is expected and says
# little about the model - confirm these columns are meant to be features.
print("Mean Absolute Error:",lin_mae)
print(f"r2 score of the model : {lin_r2:.3f}")
print(f"Root Mean Squared Error of the model : {rmse:.3f}")

Mean Absolute Error: 0.002787397636022515
r2 score of the model : 1.000
Root Mean Squared Error of the model : 0.005


# Predicting for the test set

In [16]:
# Read the separate test file that predictions are needed for
test_csv_path = 'D:\\Sem 5\\ML\\test.csv'
X_test_set = pd.read_csv(test_csv_path)

In [17]:
# Remove the same four feature columns that were removed from the training data
unused_test_columns = ['Year_of_Release', 'Developer', 'Publisher', 'Platform']
X_test_set = X_test_set.drop(columns=unused_test_columns)

In [19]:
# Mirror the training clean-up: remove rows without a Name, then rows without a Genre.
# NOTE(review): rows removed here receive no prediction - confirm that is acceptable.
for required_column in ("Name", "Genre"):
    missing_rows = X_test_set.index[X_test_set[required_column].isna()]
    X_test_set = X_test_set.drop(index=missing_rows)

# Treat the "tbd" placeholder as a missing score rather than a score of 0. The training
# rows with "tbd" were removed, so the model never saw 0 standing in for "unknown".
# The resulting NaN is filled by the User_Score mean imputation that follows.
# The result is assigned back to the column instead of calling replace(..., inplace=True)
# on a column selection, which does not update the frame under pandas Copy-on-Write.
X_test_set["User_Score"] = (
    X_test_set["User_Score"].replace("tbd", np.nan).astype(np.float64)
)

In [20]:
# Fill the gaps in the numeric review columns with each column's own mean.
# NOTE(review): these fill values are computed from the test file itself, not from the training data.
for column in ("Critic_Score", "Critic_Count", "User_Count", "User_Score"):
    X_test_set[column] = X_test_set[column].fillna(X_test_set[column].mean())

# Rating holds labels, so fill its gaps with the most frequent value instead
most_common_test_rating = X_test_set["Rating"].mode()[0]
X_test_set["Rating"] = X_test_set["Rating"].fillna(most_common_test_rating)

In [21]:
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler

# Encode Rating with the codes learned from the training data (ordinal_encoder was fitted
# on the training Rating column in an earlier cell). Fitting a new encoder here would
# number the categories found in the test file, so the same label could receive a
# different code than the one the model was trained on.
rating_codes = {label: code for code, label in enumerate(ordinal_encoder.categories_[0])}

# Labels that never occurred in training map to NaN; Rating is a float column handled by
# the preprocessor's numeric branch, whose median imputer fills such gaps.
X_test_set["Rating"] = X_test_set["Rating"].map(rating_codes).astype(np.float64)

In [22]:
# Predicting test set results for separate test file
X_test_prep1 = preprocessor.transform(X_test_set)
y_pred_test = model.predict(X_test_prep1)

In [23]:
# Check how many predictions were produced (one per remaining row of X_test_set)
y_pred_test.shape

(3344,)

In [28]:
# Pair each prediction with its id by position. X_test_set keeps its original row labels
# after the earlier row drops, while the new DataFrame gets a fresh 0..n-1 index.
# Assigning the 'id' Series directly would align on those labels and attach wrong or
# missing ids; .values drops the index so row i of the predictions gets row i's id.
predictions = pd.DataFrame(y_pred_test, columns=['Global_Sales'])
predictions['id'] = X_test_set['id'].values

In [29]:
# Write the predictions to Predictions.csv in the current working directory.
# With to_csv's defaults the DataFrame index is written too, as the first (unnamed) column.
predictions.to_csv('Predictions.csv')