In [None]:
import pandas as pd
import numpy as np
import IPython.display as display
from matplotlib import pyplot as plt
import seaborn as sns
import io
import base64
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Load the raw sales table from its CSV file.
csv_path = '/content/vgsales.csv'
df = pd.read_csv(csv_path)

In [None]:
# Preview the first five rows of the raw data.
df.head(n=5)

In [None]:
df.shape  # (number of rows, number of columns) of the raw table

In [None]:
# Keep an untouched copy of the raw frame; the cells below modify `df` in place.
data = df.copy(deep=True)

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
df.info()   # column dtypes and non-null counts

In [None]:
# 'Year' is not used as a feature in this analysis, so drop the column.
df.drop(columns='Year', inplace=True)

In [None]:
# Use 'Rank' as the row index instead of keeping it as a regular column.
df.set_index(keys='Rank', inplace=True)

In [None]:
# 'Name' is not used as a feature in this analysis, so drop the column.
df.drop(columns='Name', inplace=True)

In [None]:
# Preview the frame after the column drops and re-indexing.
df.head(n=5)

In [None]:
# Pool of observed (non-missing) publishers to sample replacements from.
has_publisher = df['Publisher'].notna()
valid_values = df.loc[has_publisher, 'Publisher'].values

In [None]:
# Fill each missing publisher with a value drawn at random from the observed
# publishers, so the filled values follow the empirical distribution.
# A seeded generator makes the imputation (and every result downstream of it)
# identical on each Restart & Run All.
rng = np.random.default_rng(42)
missing_publisher = df['Publisher'].isna()
random_publishers = pd.Series(
    rng.choice(valid_values, size=int(missing_publisher.sum())),
    index=df.index[missing_publisher],  # align the draws with the rows being filled
)
df['Publisher'] = df['Publisher'].fillna(random_publishers)

In [None]:
# Confirm the imputation left no missing publishers (expected result: 0).
df['Publisher'].isna().sum()

In [None]:
# Preview the frame after filling the missing publishers.
df.head(n=5)

In [None]:
# Remove repeated rows, keeping the first occurrence. Rows are compared on
# the column values only; the 'Rank' index does not take part in the check.
df.drop_duplicates(keep='first', inplace=True)

In [None]:
# Draw one box plot per numeric column to check each for outliers.
numerical_features = df.select_dtypes(include=np.number).columns

for feature in numerical_features:
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.boxplot(df[feature].dropna())
    ax.set_title(f"Box Plot of {feature}")
    ax.set_ylabel(feature)
    plt.show()

In [None]:
# Replace the 'Platform' labels with integer codes.
le = LabelEncoder().fit(df['Platform'])
df['Platform'] = le.transform(df['Platform'])

In [None]:
# Replace the 'Genre' labels with integer codes.
le = LabelEncoder().fit(df['Genre'])
df['Genre'] = le.transform(df['Genre'])

In [None]:
# Replace the 'Publisher' labels with integer codes.
le = LabelEncoder().fit(df['Publisher'])
df['Publisher'] = le.transform(df['Publisher'])

In [None]:
# Min-max scale every column to the [0, 1] range.
# fit_transform returns a bare NumPy array, so wrap the result straight back
# into a DataFrame carrying the original column labels; this avoids having to
# restore the names later from hard-coded column positions.
scaler = MinMaxScaler()
df = pd.DataFrame(scaler.fit_transform(df), columns=df.columns)

In [None]:
# Make sure the scaled values are held in a DataFrame.
df = pd.DataFrame(data=df)

In [None]:
# Sanity check after normalization: one box per column, and every box should
# now lie within [0, 1]. The title names what is plotted instead of the
# leftover 'column_name' placeholder.
fig, ax = plt.subplots()
ax.boxplot(df)
ax.set_title('Boxplot of normalized features')
ax.set_ylabel('Values')
plt.show()

In [None]:
# Preview the scaled frame.
df.head(n=5)

In [None]:
# Map positional column labels (0..7) back to descriptive names in one pass.
# rename() ignores keys that are not present among the columns.
column_names = {
    0: 'Platform',
    1: 'Genre',
    2: 'Publisher',
    3: 'NA_Sales',
    4: 'EU_Sales',
    5: 'JP_Sales',
    6: 'Other_Sales',
    7: 'Global_Sales',
}
df = df.rename(columns=column_names)

In [None]:
# Pairwise correlations between all columns, drawn as an annotated heatmap
# to show how the variables relate to each other.
corr_matrix = df.corr()
fig, ax = plt.subplots()
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', linewidths=0.5, ax=ax)
ax.set_title('Correlation Matrix Heatmap')
plt.show()

In [None]:
# Target: the 'Global_Sales' column. Inputs: every other column.
y = df['Global_Sales']
X = df.drop(columns='Global_Sales')

In [None]:
# Fit a min-max scaler on the feature matrix, then apply it.
scaler = MinMaxScaler().fit(X)
X_scaled = scaler.transform(X)

In [None]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible. The split is taken on X_scaled (the output of the
# feature-scaling step) rather than on X, so that step's result is actually
# what the model is trained and evaluated on.
X_train, X_test, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.2, random_state=42
)

In [None]:
# Fit an ordinary least-squares linear regression on the training partition.
model = LinearRegression().fit(X_train, y_train)
model  # display the fitted estimator as the cell output

In [None]:
# Predict on both partitions so train and test fit can be compared.
# r2_score is already imported in the notebook's import cell, so it is not
# re-imported here.
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

In [None]:
# Coefficient of determination on the training partition.
train_r2 = r2_score(y_train, y_train_pred)
print("R² train:", train_r2)

In [None]:
# Coefficient of determination on the held-out test partition.
test_r2 = r2_score(y_test, y_test_pred)
print("R² test:", test_r2)

In [None]:
# Show the fitted parameters: the intercept and one coefficient per feature.
intercept, coefficients = model.intercept_, model.coef_
print("Intercept:", intercept)
print("Coefficients:", coefficients)

In [None]:
# Relative frequency of each distinct value in the (min-max scaled)
# 'Global_Sales' column.
df['Global_Sales'].value_counts(normalize=True)

In [None]:

# Residual = actual minus predicted on the test partition. Plot the residuals
# against the predictions, with a dashed reference line at zero.
residuals = y_test - y_test_pred

fig, ax = plt.subplots(figsize=(8, 5))
sns.scatterplot(x=y_test_pred, y=residuals, ax=ax)
ax.axhline(0, color='red', linestyle='--')
ax.set_xlabel('Predicted Values')
ax.set_ylabel('Residuals')
ax.set_title('Residual Plot')
plt.show()


In [None]:
# Actual vs. predicted on the test partition; the dashed red diagonal marks
# where prediction equals the actual value.
fig, ax = plt.subplots(figsize=(6, 6))
sns.scatterplot(x=y_test, y=y_test_pred, ax=ax)
diagonal = [y_test.min(), y_test.max()]
ax.plot(diagonal, diagonal, '--r')
ax.set_xlabel('Actual')
ax.set_ylabel('Predicted')
ax.set_title('Actual vs Predicted')
plt.show()

In [None]:
# Histogram of the test residuals with a kernel density estimate overlaid.
fig, ax = plt.subplots()
sns.histplot(residuals, kde=True, ax=ax)
ax.set_title('Distribution of Residuals')
ax.set_xlabel('Residual')
plt.show()