## DS Project phase 1
### Mohammad Mahdi Samadi 810101465
### Kasra Ghorbani 810101489

#### Import the required libraries

In [13]:
import pandas as pd

#### Read the CSVs and store them in Pandas DataFrames
The data we downloaded from Kaggle came as 2 CSV files. The first, "raw_wages", contains unprocessed and sometimes incomplete columns of data. The second, "wages_cleaned", contains some new columns as well as some of the original columns, whose values were sometimes changed (Salary, for example).

In [14]:
# Both input files are expected next to the notebook.
RAW_WAGES_CSV = "raw_wages.csv"
CLEANED_WAGES_CSV = "wages_cleaned.csv"

# Load the unprocessed export and its pre-cleaned counterpart.
df_raw = pd.read_csv(RAW_WAGES_CSV)
df_cleaned = pd.read_csv(CLEANED_WAGES_CSV)

#### Features provided to us

In [15]:
# List the column names of each input frame under its own heading.
column_listings = (
    ("raw dataframe columns:", df_raw),
    ("cleaned dataframe columns:", df_cleaned),
)
for heading, frame in column_listings:
    print(heading)
    print(*frame.columns)

raw dataframe columns:
Name Club Division Based Nat EU National Caps AT Apps Position Age CR Begins Expires Last Club Last Trans. Fee Salary
cleaned dataframe columns:
Is_top_5_League Based_rich_nation Is_top_ranked_nation EU_National Caps Apps Age Reputation Is_top_prev_club Last_Transfer_Fee Salary


#### Remove unnecessary columns
We removed 2 types of columns :

- Columns which had missing data : They only had values for the first 9 rows of data (Name, Club, Position, Last_Transfer_Fee)

- Columns repeated across both dataframes (Salary, EU National, Caps, AT Apps, Age); we keep the copy from the cleaned dataframe. The raw `CR` column is dropped as well.

Then we proceeded to concatenate the 2 dataframes so that we can use the remaining data of both dataframes more easily.



In [16]:
# Drop from copies instead of in place: `df_raw` / `df_cleaned` stay exactly as
# loaded, so this cell can be re-run without a KeyError on already-removed columns.
cleaned_kept = df_cleaned.drop(columns=['Last_Transfer_Fee'])
raw_kept = df_raw.drop(
    columns=['Name', 'Club', 'Position', 'Salary', 'EU National', 'Caps', 'AT Apps', 'Age', "CR"]
)

# Put the remaining raw and cleaned columns side by side; pandas aligns the
# two frames on their row index.
df = pd.concat([raw_kept, cleaned_kept], axis=1)

#### Adjust the Last Transfer Fee column :
Remove the currency symbol and the K/M suffixes so that every fee is stored as a plain number.

In [17]:
def fix_LTF(df):
    """Convert the 'Last Trans. Fee' column from display strings to floats.

    Each entry is handled as follows:
      * spaces and the euro sign are removed;
      * "-" or an empty string becomes 0;
      * a trailing "K" multiplies the number by 1,000 and a trailing "M"
        by 1,000,000; anything else is read as a plain number.
    Missing values (None / NaN) also become 0, and entries that are already
    numeric are passed through unchanged, so the function can safely be
    re-run on a column it has already converted.

    Args:
        df: DataFrame that has a 'Last Trans. Fee' column.

    Returns:
        The same DataFrame object, with the column overwritten by floats.

    Raises:
        ValueError: if a string entry is not a number after the symbol and
            suffix have been removed.
    """
    suffix_multipliers = {"K": 1_000, "M": 1_000_000}

    def _parse_fee(fee):
        # Parse a single fee entry into a float amount.
        if not isinstance(fee, str):
            # NaN/None means "no fee recorded"; real numbers are kept as they are.
            return 0.0 if pd.isna(fee) else float(fee)

        # Strip the currency sign before looking at the last character, so
        # the suffix is found wherever the sign was written.
        text = fee.replace(" ", "").replace("€", "")
        if text in ("", "-"):
            return 0.0

        mult = suffix_multipliers.get(text[-1], 1)
        if mult != 1:
            text = text[:-1]
        return float(text) * mult

    df['Last Trans. Fee'] = [_parse_fee(fee) for fee in df['Last Trans. Fee']]
    return df

# Normalise the fee column of the combined dataframe.
df = fix_LTF(df)

#### Remove extra spaces from strings

In [18]:
# Trim surrounding whitespace in every text column. Entries that are not
# strings (e.g. NaN for a missing value) are left untouched instead of raising.
for col in df.select_dtypes(include=[object]).columns:
    df[col] = df[col].map(lambda value: value.strip() if isinstance(value, str) else value)

# Discard the rows whose contract expiry is the "-" placeholder. The explicit
# copy makes `df` an independent frame rather than a slice of the old one, so
# assigning columns to it afterwards does not raise SettingWithCopyWarning.
df = df[df["Expires"] != '-'].copy()

#### Convert binary based variables to booleans

In [19]:
# Indicator columns that should be stored as real booleans.
boolean_variables = [
    "Is_top_5_League",
    "Based_rich_nation",
    "Is_top_ranked_nation",
    "EU_National",
    "Is_top_prev_club",
]
for flag_column in boolean_variables:
    # bool() maps every truthy entry to True and every falsy entry to False.
    df[flag_column] = df[flag_column].map(bool)

In [20]:
# Store the Caps column as integers.
df = df.astype({"Caps": int})

#### Let's have a look at our refined data:

In [21]:
# Print each column's dtype and non-null count for the refined dataframe.
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 40768 entries, 0 to 40790
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   Division              40768 non-null  object 
 1   Based                 40768 non-null  object 
 2   Nat                   40768 non-null  object 
 3   Begins                40768 non-null  object 
 4   Expires               40768 non-null  object 
 5   Last Club             40768 non-null  object 
 6   Last Trans. Fee       40768 non-null  float64
 7   Is_top_5_League       40768 non-null  bool   
 8   Based_rich_nation     40768 non-null  bool   
 9   Is_top_ranked_nation  40768 non-null  bool   
 10  EU_National           40768 non-null  bool   
 11  Caps                  40768 non-null  int32  
 12  Apps                  40768 non-null  int64  
 13  Age                   40768 non-null  float64
 14  Reputation            40768 non-null  float64
 15  Is_top_prev_club      40

In [22]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,Last Trans. Fee,Caps,Apps,Age,Reputation,Salary
count,40768.0,40768.0,40768.0,40768.0,40768.0,40768.0
mean,664610.4,5.146978,126.360086,25.180436,4593.855254,319079.6
std,4422994.0,13.669244,125.971433,5.327834,1164.582876,2006623.0
min,0.0,0.0,0.0,17.0,50.0,180.0
25%,0.0,0.0,12.0,21.0,3885.75,16500.0
50%,0.0,0.0,91.0,25.0,4518.0,44500.0
75%,0.0,3.0,207.0,29.0,5391.0,156000.0
max,186000000.0,200.0,799.0,45.0,10000.0,203478000.0


#### Extract new meaningful features from existing ones

In [23]:
# Year used to turn the current "Age" into the age at signing.
# NOTE(review): assumes the ages in the dataset are as of 2024 - confirm against the data source.
REFERENCE_YEAR = 2024

# Contract dates are strings with "/" separators; the third field is the year.
begins_year = df["Begins"].str.split("/").str[2].astype("int64")
expires_year = df["Expires"].str.split("/").str[2].astype("int64")

# Build the derived columns on a new frame in one step. Nothing is written
# into a slice of an older frame, so no blanket warnings.filterwarnings("ignore")
# is needed and genuine warnings stay visible.
df = df.drop(columns=["Begins", "Expires"]).assign(
    **{
        "Begins Year": begins_year,
        "Expires Year": expires_year,
        "under 30": df["Age"] < 30,
        # Current age minus the years elapsed since the contract started.
        "Age Signing Contract": (df["Age"] - (REFERENCE_YEAR - begins_year)).astype(int),
        "Contract Duration": expires_year - begins_year,
        # Sum of the "Apps" and "Caps" columns.
        "Games Played": df["Apps"] + df["Caps"],
    }
)

#### Save preprocessed dataset into a csv file

In [24]:
# Write the preprocessed dataframe to disk.
# NOTE(review): the row index is written too (pandas default) and is no longer
# contiguous after the row filtering above - confirm that downstream readers
# expect it (e.g. read with index_col=0) before switching to index=False.
df.to_csv("preprocessed_data.csv")