In [None]:
import pandas as pd
import numpy as np

In [None]:
# Render every column when a frame is displayed instead of eliding the middle.
pd.set_option('display.max_columns', None)

# low_memory=False: parse the file in one pass so column dtypes are inferred
# from the whole column rather than chunk by chunk.
df = pd.read_csv('fifa21 raw data v2.csv', low_memory=False)

# First look at the raw data: a few rows, then dtypes and non-null counts.
display(df.head())
df.info()

In [None]:
"""Check if the column data type is 'object' and determine if it should be converted to numeric"""
# For every object-dtype column, print its first five distinct values so the
# columns that really hold numbers can be spotted by eye.
for col in df.columns:
    if df[col].dtype != 'object':
        continue
    print(f"Column '{col}': {df[col].unique()[:5]}")
    print("-" * 50)

In [None]:
def clean_height(x):
    """Convert a height string to whole centimeters.

    Accepts metric text such as ``"170cm"`` and imperial text such as
    ``5'7"`` (feet before the apostrophe, inches after it).

    Parameters
    ----------
    x : str or missing value
        Raw height. Anything ``pd.isna`` reports as missing is returned
        unchanged.

    Returns
    -------
    int, None, or the original missing value
        Height rounded to the nearest centimeter (halves round up), or
        ``None`` when the text is not a recognizable height.
    """
    if pd.isna(x):
        return x
    text = str(x).strip()

    try:
        if 'cm' in text:
            # Parse via float() so decimal values ("170.5cm") are accepted.
            return int(float(text.replace('cm', '')) + 0.5)

        if "'" in text:
            feet_text, _, inches_text = text.partition("'")
            inches_text = inches_text.replace('"', '').strip()
            # A bare feet value such as 6' means zero additional inches.
            inches = int(inches_text) if inches_text else 0
            total_inches = int(feet_text) * 12 + inches
            # 1 in = 2.54 cm exactly. Integer arithmetic rounds to nearest
            # (the previous int() truncation turned 187.96 cm into 187) and
            # avoids float noise on exact halves such as 190.5.
            return (total_inches * 254 + 50) // 100
    except (ValueError, OverflowError):
        # Unparseable number: report "unknown" instead of raising mid-apply.
        return None
    return None

def clean_weight(x):
    """Convert a weight string to whole kilograms.

    Accepts metric text such as ``"72kg"`` and imperial text such as
    ``"159lbs"``.

    Parameters
    ----------
    x : str or missing value
        Raw weight. Anything ``pd.isna`` reports as missing is returned
        unchanged.

    Returns
    -------
    int, None, or the original missing value
        Weight rounded to the nearest kilogram, or ``None`` when the text is
        not a recognizable weight.
    """
    if pd.isna(x):
        return x
    text = str(x).strip()
    kg_per_pound = 0.45359237  # exact definition of the avoirdupois pound

    try:
        if 'kg' in text:
            return round(float(text.replace('kg', '')))

        if 'lbs' in text:
            # round() instead of int(): truncation turned 74.84 kg into 74.
            return round(float(text.replace('lbs', '')) * kg_per_pound)
    except (ValueError, OverflowError):
        # Unparseable number: report "unknown" instead of raising mid-apply.
        return None
    return None


# Derive metric columns next to the raw ones: target column -> (raw column, cleaner).
metric_columns = {
    'Height_cm': ('Height', clean_height),
    'Weight_kg': ('Weight', clean_weight),
}
for target_col, (raw_col, cleaner) in metric_columns.items():
    df[target_col] = df[raw_col].apply(cleaner)

# Quick sanity check: show a handful of the distinct cleaned values.
print("-------------------- After Cleaning Height and Weight ------------------")
height_sample = df['Height_cm'].unique()[:5]
weight_sample = df['Weight_kg'].unique()[:5]
print(f"Height Sample: {height_sample}")
print(f"Weight Sample: {weight_sample}")

In [None]:
def clean_money(x):
    """Convert a financial string such as ``"€103.5M"`` to an integer amount.

    The euro sign is stripped, and a trailing ``M`` (millions) or ``K``
    (thousands) suffix scales the number accordingly.

    Parameters
    ----------
    x : str or missing value
        Raw amount. Anything ``pd.isna`` reports as missing is returned
        unchanged.

    Returns
    -------
    int or the original missing value
        The amount rounded to the nearest whole unit. Text that cannot be
        parsed yields ``0``, for every form of input.
    """
    if pd.isna(x):
        return x
    text = str(x).replace('€', '').strip()

    multiplier = 1
    for suffix, factor in (('M', 1_000_000), ('K', 1_000)):
        if text.endswith(suffix):
            multiplier = factor
            text = text[:-len(suffix)].strip()
            break

    try:
        # round(), not int(): a float product can land a hair below the true
        # value (…999.999…), and truncation would then lose a whole euro.
        return round(float(text) * multiplier)
    except (ValueError, OverflowError):
        # Same fallback for suffixed and plain values; previously a malformed
        # "M"/"K" value raised while a malformed plain value returned 0.
        return 0

# Raw currency columns; each one present in df gets a numeric
# "<name>_Num_in_EUR" companion column.
financial_cols = ['Value', 'Wage', 'Release Clause']

for col in financial_cols:
    if col in df.columns:
        df[f"{col}_Num_in_EUR"] = df[col].apply(clean_money)

if 'Club' in df.columns:
    # regex=False: remove literal newline characters, independent of the
    # pandas version's default for `regex`; then trim surrounding whitespace.
    df['Club'] = df['Club'].str.replace('\n', '', regex=False).str.strip()

print("-------------------- After Cleaning Financial Columns ------------------")
# Report only the columns that were actually created above, so a missing raw
# column cannot turn the summary into a KeyError.
for col in financial_cols:
    numeric_col = f"{col}_Num_in_EUR"
    if numeric_col in df.columns:
        print(f"Cleaned {col} : {df[numeric_col].unique()[:5]}")
if 'Club' in df.columns:
    print(f"Cleaned Club : {df['Club'].unique()[:5]}")





In [None]:
def check_currency(col_name, frame=None):
    """Report whether a column holds any non-missing value without a '€' sign.

    Parameters
    ----------
    col_name : str
        Name of the column to inspect.
    frame : pandas.DataFrame, optional
        Frame to inspect. Defaults to the notebook-level ``df``.

    Returns
    -------
    None
        The result is printed; offending distinct values are shown with
        ``display``.
    """
    data = df if frame is None else frame
    if col_name not in data.columns:
        print(f"Column {col_name} not found")
        return

    column = data[col_name]
    # astype(str) lets the check run on non-string columns, and regex=False
    # matches the euro sign literally. Missing values are excluded by the
    # notna() mask; negating the raw .str.contains() result directly fails
    # when the column contains NaN.
    has_euro = column.astype(str).str.contains('€', regex=False)
    non_euro_values = data[column.notna() & ~has_euro]

    if len(non_euro_values) > 0:
        print(f"Column {col_name} contains {len(non_euro_values)} non-Euro currency values")
        display(non_euro_values[col_name].unique())
    else:
        print(f"Column {col_name} contains only Euro currency values")
        
# Run the currency check on each raw monetary column in turn.
for money_col in ('Value', 'Wage', 'Release Clause'):
    check_currency(money_col)


      
      

In [16]:
"""Check for missing values in specific columns before filling them"""
# Columns inspected for gaps before any imputation takes place.
cols_to_fix = ['Height_cm', 'Weight_kg', 'Club']
# The header is Greek for "Missing values BEFORE filling".
print("--- Κενά ΠΡΙΝ το γέμισμα ---")
# Per-column count of missing entries, largest first.
print(
    df[cols_to_fix]
    .isna()
    .sum()
    .sort_values(ascending=False)
)

--- Κενά ΠΡΙΝ το γέμισμα ---
Height_cm    0
Weight_kg    0
Club         0
dtype: int64


In [18]:
"""Robust imputation for future missing values"""

# Numeric columns: replace missing entries with the column mean.
for numeric_col in ('Height_cm', 'Weight_kg'):
    if numeric_col in df.columns:
        column_mean = df[numeric_col].mean()
        df[numeric_col] = df[numeric_col].fillna(column_mean)

# Club: both missing and empty-string entries become the 'No Club' label.
if 'Club' in df.columns:
    df['Club'] = df['Club'].fillna('No Club')
    blank_club = df['Club'] == ''
    df.loc[blank_club, 'Club'] = 'No Club'

# Confirm that no gaps remain in the treated columns.
print("--- Final Missing Values Check ---")
cols_to_check = ['Height_cm', 'Weight_kg', 'Club']
remaining_missing = df[cols_to_check].isnull().sum()
print(remaining_missing)

--- Final Missing Values Check ---
Height_cm    0
Weight_kg    0
Club         0
dtype: int64


In [19]:
# Summary statistics for Age, to eyeball implausible values.
print("---  Age Check ---")
print(df['Age'].describe())

# Position sanity check: prefer 'Best Position', fall back to 'Position'.
print("---  Position Check ---")
if 'Best Position' in df.columns:
    target_pos_col = 'Best Position'
else:
    target_pos_col = 'Position'

if target_pos_col not in df.columns:
    print("Position column not found!")
else:
    print(f"Checking column: {target_pos_col}")
    unique_positions = df[target_pos_col].unique()
    position_count = len(unique_positions)
    print(f"Number of unique positions: {position_count}")
    print("All positions:", unique_positions)

---  Age Check ---
count    18979.000000
mean        25.194109
std          4.710520
min         16.000000
25%         21.000000
50%         25.000000
75%         29.000000
max         53.000000
Name: Age, dtype: float64
---  Position Check ---
Checking column: Best Position
Number of unique positions: 15
All positions: ['RW' 'ST' 'GK' 'CAM' 'LW' 'CB' 'CDM' 'CF' 'CM' 'RB' 'LB' 'LM' 'RM' 'LWB'
 'RWB']


In [21]:
# Write the cleaned frame to disk; index=False leaves the row index out of the file.
clean_filename = 'fifa21_cleaned.csv'
df.to_csv(path_or_buf=clean_filename, index=False)
