In [226]:
import numpy as np
import pandas as pd

# Load the raw FIFA 2019 player table; the file is decoded as Windows-1252.
data = pd.read_csv(
    filepath_or_buffer="data/fifa_2019_player_data.csv",
    encoding="Windows-1252",
)

In [227]:
# Remove the columns the author judged harmful to the model, then discard rows
# whose target ("Overall") is missing.
#
# Reassignment with errors="ignore" (instead of inplace=True) keeps this cell
# re-runnable: executing it a second time no longer raises KeyError for
# columns that were already removed by the first run.
data = (
    data
    .drop(
        columns=["Index", "ID", "Jersey Number", "Contract Valid Until"],
        errors="ignore",
    )
    .dropna(subset=["Overall"])
)
data.info()
data.head()


<class 'pandas.core.frame.DataFrame'>
Index: 18206 entries, 0 to 18206
Data columns (total 85 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Name                      18206 non-null  object 
 1   Age                       18205 non-null  float64
 2   Photo                     18206 non-null  object 
 3   Nationality               18206 non-null  object 
 4   Flag                      18206 non-null  object 
 5   Overall                   18206 non-null  float64
 6   Potential                 18206 non-null  int64  
 7   Club                      17965 non-null  object 
 8   Club Logo                 18206 non-null  object 
 9   Value                     18206 non-null  object 
 10  Wage                      18206 non-null  object 
 11  Special                   18206 non-null  int64  
 12  Preferred Foot            18158 non-null  object 
 13  International Reputation  18158 non-null  float64
 14  Weak Foot  

Unnamed: 0,Name,Age,Photo,Nationality,Flag,Overall,Potential,Club,Club Logo,Value,...,Composure,Marking,StandingTackle,SlidingTackle,GKDiving,GKHandling,GKKicking,GKPositioning,GKReflexes,Release Clause
0,L. Messi,31.0,https://cdn.sofifa.org/players/4/19/158023.png,Argentina,https://cdn.sofifa.org/flags/52.png,94.0,94,FC Barcelona,https://cdn.sofifa.org/teams/2/light/241.png,€110.5M,...,96.0,33.0,28.0,26.0,6.0,11.0,15.0,14.0,8.0,€226.5M
1,Cristiano Ronaldo,33.0,https://cdn.sofifa.org/players/4/19/20801.png,Portugal,https://cdn.sofifa.org/flags/38.png,94.0,94,Juventus,https://cdn.sofifa.org/teams/2/light/45.png,€77M,...,95.0,28.0,31.0,23.0,7.0,11.0,15.0,14.0,11.0,€127.1M
2,Neymar Jr,26.0,https://cdn.sofifa.org/players/4/19/190871.png,Brazil,https://cdn.sofifa.org/flags/54.png,92.0,93,Paris Saint-Germain,https://cdn.sofifa.org/teams/2/light/73.png,€118.5M,...,94.0,27.0,24.0,33.0,9.0,9.0,15.0,15.0,11.0,€228.1M
3,De Gea,27.0,https://cdn.sofifa.org/players/4/19/193080.png,Spain,https://cdn.sofifa.org/flags/45.png,91.0,93,Manchester United,https://cdn.sofifa.org/teams/2/light/11.png,€72M,...,68.0,15.0,21.0,13.0,90.0,85.0,87.0,88.0,94.0,€138.6M
4,K. De Bruyne,27.0,https://cdn.sofifa.org/players/4/19/192985.png,Belgium,https://cdn.sofifa.org/flags/7.png,91.0,92,Manchester City,https://cdn.sofifa.org/teams/2/light/10.png,€102M,...,88.0,68.0,58.0,51.0,15.0,13.0,5.0,10.0,13.0,€196.4M


In [228]:
# Build the modelling frame: keep "Preferred Foot" as the only categorical
# column, then add every other column of `data` that can be cast to float.
new_data = pd.DataFrame(data["Preferred Foot"])
for column in data.columns:
    if column in new_data.columns:
        continue
    try:
        new_data[column] = data[column].astype(float)
    except (ValueError, TypeError):
        # Column is not convertible to float (e.g. names, URLs or
        # "€110.5M"-style strings) -- leave it out. Only conversion errors are
        # caught so that KeyboardInterrupt and genuine bugs still surface.
        continue

In [229]:
# Fill missing values in the numeric columns with each column's mean.
from sklearn.impute import SimpleImputer

numeric_data = new_data.select_dtypes(include=[np.number])
numeric_columns = numeric_data.columns

# NOTE(review): the imputer is fitted on every row, before the train/test
# split performed in a later cell, so held-out rows contribute to the means.
imputer = SimpleImputer(missing_values=np.nan, strategy="mean").fit(numeric_data)
filled_values = imputer.transform(numeric_data)

new_data[numeric_columns] = filled_values
new_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 18206 entries, 0 to 18206
Data columns (total 42 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Preferred Foot            18158 non-null  object 
 1   Age                       18206 non-null  float64
 2   Overall                   18206 non-null  float64
 3   Potential                 18206 non-null  float64
 4   Special                   18206 non-null  float64
 5   International Reputation  18206 non-null  float64
 6   Weak Foot                 18206 non-null  float64
 7   Skill Moves               18206 non-null  float64
 8   Crossing                  18206 non-null  float64
 9   Finishing                 18206 non-null  float64
 10  HeadingAccuracy           18206 non-null  float64
 11  ShortPassing              18206 non-null  float64
 12  Volleys                   18206 non-null  float64
 13  Dribbling                 18206 non-null  float64
 14  Curve      

In [230]:
# One-hot encode "Preferred Foot".
# Author's note: this step lowers the score by 0.001.
from sklearn import preprocessing

# Rows without a preferred foot are discarded, and the index is renumbered so
# it lines up with the positional index of the encoded frame built below.
rows_with_foot = new_data.dropna(subset=["Preferred Foot"]).reset_index(drop=True)

# NOTE(review): no `drop=` is passed, so every category gets its own indicator
# column; the indicators then sum to 1 and are collinear with a model
# intercept -- TODO consider whether that is intended.
ohe = preprocessing.OneHotEncoder(sparse_output=False)
foot_column = rows_with_foot["Preferred Foot"].values.reshape(-1, 1)
preferred_foot = ohe.fit_transform(foot_column)
preferred_foot_df = pd.DataFrame(
    preferred_foot,
    columns=ohe.get_feature_names_out(["Preferred Foot"]),
)

# Swap the raw categorical column for its indicator columns.
new_data = pd.concat(
    [rows_with_foot.drop("Preferred Foot", axis=1), preferred_foot_df],
    axis=1,
)

In [231]:
# Separate the target from the features and hold out a test set.
from sklearn.model_selection import train_test_split

TEST_FRACTION = 0.33  # share of rows reserved for evaluation
SPLIT_SEED = 23       # fixed seed so the split is repeatable

# The target is wrapped in a single-column DataFrame built from the raw values.
y_value = pd.DataFrame(new_data["Overall"].values)
x_values = new_data.drop(columns="Overall").copy()

split_parts = train_test_split(
    x_values,
    y_value,
    test_size=TEST_FRACTION,
    random_state=SPLIT_SEED,
)
x_train, x_test, y_train, y_test = split_parts

In [232]:
# Fit a multiple linear regression on the training split, then predict the
# held-out rows.
from sklearn.linear_model import LinearRegression

regression = LinearRegression().fit(x_train, y_train)
y_pred = regression.predict(x_test)

In [233]:
# Model evaluation: place the predictions next to the true values and report
# the model score.
y_pred_df = pd.DataFrame({"Predicted": y_pred.flatten()})
y_test_df = pd.DataFrame({"Actual": y_test.values.flatten()})
pred_and_test_data = pd.concat([y_pred_df, y_test_df], axis="columns")

# For a LinearRegression, score() is the coefficient of determination (R^2),
# not a classification accuracy.
accuracy = regression.score(x_test, y_test)
print("Doğruluk oranı: ", accuracy)
print(pred_and_test_data)

Doğruluk oranı:  0.9264820992143259
      Predicted  Actual
0     65.445244    63.0
1     66.236651    69.0
2     68.100356    70.0
3     82.167460    82.0
4     73.169985    77.0
...         ...     ...
5988  65.324250    68.0
5989  62.546087    60.0
5990  66.053337    67.0
5991  73.483833    75.0
5992  73.741432    74.0

[5993 rows x 2 columns]
