In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error


In [4]:
# Load the dataset
data = pd.read_csv('merged.csv')

data = data.dropna(subset=['Market Value (Euros)'])

dropped = data.drop(['Market Value (Euros)', 'Highest Market Value (Euros)'], axis=1)

# Select the relevant features and target variable
features_scale = dropped.drop(['PlayerName', 'Citizenship 1', 'Position', 'Position 2', 'Foot', 'Agent', 'ContractExpiration', 'nationality', 'Affiliation', 'League'], axis=1)
features_encode = dropped[[ 'Citizenship 1', 'Position', 'Position 2', 'Foot', 'Agent', 'ContractExpiration', 'nationality', 'Affiliation', 'League']]
target = data['Market Value (Euros)']

# Preprocess the scaled features
scaled_features = features_scale.apply(pd.to_numeric, errors='coerce').values
scaled_features = np.nan_to_num(scaled_features)

# One-hot encode the categorical features
encoder = OneHotEncoder()
encoded_features = encoder.fit_transform(features_encode).toarray()

# Concatenate the scaled features and encoded features
features = np.concatenate((scaled_features, encoded_features), axis=1)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.2, random_state=42)

# Create the linear regression model
lin_reg = LinearRegression()

# Fit the model
lin_reg.fit(X_train, y_train)


LinearRegression()

In [5]:
# Evaluate the model
y_pred = lin_reg.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
print('Mean Squared Error:', mse)


Mean Squared Error: 524352591135235.1


In [6]:
# Make predictions
predictions = lin_reg.predict(X_test)

# Print the predicted market values
for i in range(len(predictions)):
    print('Predicted:', predictions[i], 'Actual:', y_test.iloc[i])


Predicted: 32370628.193441987 Actual: 2000000.0
Predicted: -4841015.214939237 Actual: 30000000.0
Predicted: -10010567.955537438 Actual: 400000.0
Predicted: 5739033.908777356 Actual: 1800000.0
Predicted: -4852528.458807826 Actual: 8500000.0
Predicted: -36306230.33326125 Actual: 4000000.0
Predicted: 27714944.05575049 Actual: 17000000.0
Predicted: 538272.9985085726 Actual: 6000000.0
Predicted: 7071812.680857539 Actual: 2500000.0
Predicted: 3491267.137962699 Actual: 1200000.0
Predicted: -988133.6963635683 Actual: 30000000.0
Predicted: 10074246.167583346 Actual: 3000000.0
Predicted: 19948404.57298422 Actual: 600000.0
Predicted: 4396713.49471271 Actual: 12000000.0
Predicted: -11007197.31929624 Actual: 6000000.0
Predicted: 21805005.61036873 Actual: 40000000.0
Predicted: -27279315.210220575 Actual: 250000.0
Predicted: -6956901.589497685 Actual: 2000000.0
Predicted: -19735952.561768413 Actual: 12000000.0
Predicted: 2231769.9107568264 Actual: 2500000.0
Predicted: -16214128.462528348 Actual: 6000