**Code attribution**

The following code is attributed to Ruba Alomari (https://github.com/rubaomari/data/blob/main/student/student-mat-modified-RA.txt).

In [None]:
# Import libraries
import sklearn
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Download the dataset (semicolon-delimited CSV, Latin-1 encoded)
url = "https://raw.githubusercontent.com/mdeltano/sdata/main/2022-2023%20Football%20Player%20Stats.csv"
players = pd.read_csv(url, sep=';', encoding='latin-1')

# Create a backup copy of the dataset.
# A plain assignment would only bind a second name to the same DataFrame, so
# the in-place edits made to `players` later on would also alter the backup;
# .copy() produces an independent object.
players_backup = players.copy()

In [None]:
# Display the loaded dataset
players

In [None]:
# Preview the first 5 rows of the dataset
players.head(n=5)

In [None]:
# Look at a summary of the numerical attributes
# (descriptive statistics such as count, mean, std, min/max and quartiles)
players.describe()

In [None]:
# Look at a quick description of the data:
# column names, non-null counts, dtypes and memory usage
players.info()

In [None]:
# Look at the categorical data, example: the variable "Pos"
# (number of rows for each distinct value of the column)
players["Pos"].value_counts()

In [None]:
# Rank the numeric features by their correlation with the target ("Goals")
pd.options.display.max_rows = None  # never truncate displayed rows

# Pairwise correlations between numeric columns only
corr_matrix = players.corr(numeric_only=True)

# Ten strongest positive correlations with the target (the target itself included)
corr_matrix.loc[:, "Goals"].sort_values(ascending=False).iloc[:10]

In [None]:
# Line plot of Goals against G/SoT, drawn without error bars
g = sns.lineplot(
    data=players,
    x="G/SoT",
    y="Goals",
    errorbar=None,
)

In [None]:
# Scatterplot of the target (Goals) against a single feature (G/SoT)
X = players["G/SoT"]
y = players["Goals"]

# Draw the points and label both axes on the freshly created Axes
fig, pl = plt.subplots()
pl.scatter(X, y, color='b')
pl.set_xlabel("G/SoT")
pl.set_ylabel("Goals")

In [None]:
# Count the fully duplicated rows (this cell only counts them; it does not delete anything)
players.duplicated().sum()

In [None]:
# Remove duplicate rows.
# inplace=True mutates the existing DataFrame object rather than returning a
# new one, so every name bound to that object sees the change.
players.drop_duplicates(inplace=True)

In [None]:
# Find the number of missing values in each column
players.isna().sum()

In [None]:
# Drop the 'Player' and 'Rk' columns in place so they are not used as features
players.drop(columns=['Player', 'Rk'], inplace=True)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

In [None]:
# Split the columns by dtype: numeric vs. everything else
num_cols = list(players.select_dtypes(include='number').columns)
cat_cols = list(players.select_dtypes(exclude='number').columns)

# "Goals" is the target, so keep it out of the numeric feature pipeline
num_cols.remove("Goals")

# Numeric columns: fill missing values with the column mean, then standardize
num_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    StandardScaler(),
)

# Categorical columns: fill missing values with the most frequent value,
# then one-hot encode into a dense array
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(sparse_output=False),
)

# Route each column group through its pipeline; columns listed in neither
# group (the target) are passed through untouched. Output is a pandas DataFrame.
preprocessing = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ],
    remainder='passthrough',
).set_output(transform="pandas")

# Show the pipeline
preprocessing

In [None]:
# Apply the preprocessing pipeline on the dataset.
# NOTE(review): the transformer is fitted on the full dataset before the
# train/test split, so the imputation and scaling statistics also see the
# rows that later end up in the test set. Consider fitting on the training
# split only.
players_prepared = preprocessing.fit_transform(players)

# The transformer was configured with set_output(transform="pandas"), so the
# result is already a DataFrame whose columns are the generated feature names;
# rebuilding it with pd.DataFrame(...) is unnecessary. The names are still
# stored for later reference.
feature_names = preprocessing.get_feature_names_out()

players_prepared

In [None]:
from sklearn.model_selection import train_test_split

# Separate the passed-through target column from the prepared features
y = players_prepared["remainder__Goals"]
X = players_prepared.drop(columns=["remainder__Goals"])

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
print(X_train.shape, y_train.shape, X_test.shape, y_test.shape)

In [None]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel

In [None]:
# Select features by random-forest importance.
# The target (goals) is a numeric quantity and every model trained later is a
# regressor, so the importances should come from a regressor as well: a
# classifier would treat each distinct goal value as an unrelated class label.
# The import lives in this cell so the cell does not depend on another edit.
from sklearn.ensemble import RandomForestRegressor

# Fixed seed so the selected feature set is reproducible between runs
sel = SelectFromModel(RandomForestRegressor(random_state=42))
sel.fit(X_train, y_train)

In [None]:
# Boolean mask over the input features: True where the selector keeps the feature
sel.get_support()

In [None]:
# Names of the columns kept by the selector, and how many there are
support_mask = sel.get_support()
selected_feat = X_train.columns[support_mask]
len(selected_feat)

In [None]:
# Print the names of the selected features
print(selected_feat)

In [None]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares baseline trained on the training split
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)

In [None]:
# Evaluate the linear model on the held-out test split
from sklearn.metrics import mean_squared_error as mse

# Predictions for the test features
lr_y_predict = lr_model.predict(X_test)

# Mean squared error between the true and the predicted targets
lr_mse = mse(y_true=y_test, y_pred=lr_y_predict)
lr_mse

In [None]:
from sklearn.model_selection import cross_val_score, KFold

# 5-fold cross-validation of the linear model on the training split.
# The scorer returns negated MSE values.
scores = cross_val_score(
    estimator=lr_model,
    X=X_train,
    y=y_train,
    scoring='neg_mean_squared_error',
    cv=5,
)

scores

In [None]:
# The scores are negated MSE values, so flip the sign of their average
# to obtain the mean squared error
mean_score = -np.mean(scores)

print(f'Cross-Validation Mean Score: {mean_score}')

In [None]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso

# Ridge regression with regularization strength alpha=1;
# fit() returns the fitted estimator itself
RidgeRegression = Ridge(alpha=1)
ridge_model = RidgeRegression.fit(X=X_train, y=y_train)

# Lasso regression with the same regularization strength
LassoRegression = Lasso(alpha=1)
lasso_model = LassoRegression.fit(X=X_train, y=y_train)

In [None]:
# Test-set predictions and mean squared error for the ridge model
Ridge_y_predict = ridge_model.predict(X_test)
ridge_mse = mse(y_true=y_test, y_pred=Ridge_y_predict)

# Same evaluation for the lasso model
Lasso_y_predict = lasso_model.predict(X_test)
lasso_mse = mse(y_true=y_test, y_pred=Lasso_y_predict)

In [None]:
# Side-by-side summary of the error metrics collected so far, one per line
print(
    f'Linear Regression MSE: {lr_mse}',
    f'Cross-Validation MSE: {mean_score}',
    f'Ridge Regression MSE: {ridge_mse}',
    f'Lasso Regression MSE: {lasso_mse}',
    sep='\n',
)

In [None]:
import matplotlib.pyplot as plt

# Predicted vs. actual goals on the test split, using 'ridge_model'
# (treated here as the best-performing of the trained linear models)
Ridge_y_predict = ridge_model.predict(X_test)

# x-axis: model predictions, y-axis: true target values
plt.scatter(x=Ridge_y_predict, y=y_test)
plt.title("Predicted vs. Actual Values")
plt.xlabel("Goals Predicted Values")
plt.ylabel("Goals Actual Values")
plt.show()

In [None]:
# Line plot of the actual test targets against the ridge predictions, without error bars
sns.lineplot(x=Ridge_y_predict, y=y_test, errorbar=None)

In [None]:
from pandas import Series

# Label each ridge coefficient with its feature name and sort ascending
predictors = X_train.columns
coef = Series(data=ridge_model.coef_, index=predictors).sort_values()

# Bar chart of the sorted coefficients
coef.plot.bar(title='Model Coefficients')

# Attribution: https://www.analyticsvidhya.com/blog/2017/06/a-comprehensive-guide-for-linear-ridge-and-lasso-regression/