# Project details: FIFA - MoneyBall (Data Exploration)

<img src="https://bsmedia.business-standard.com/_media/bs/img/article/2022-12/16/full/1671173841-0538.jpg?im=FeatureCrop,size=(826,465)" alt="drawing" width="900" height='500'/>

### The challenge
Perform an end-to-end analysis putting into practice what you have learned so far. You will apply statistical or machine learning techniques and present your results to the class.

### Objectives
- Ask interesting and thoughtful questions and find the data to answer them.
- Focus on improving in areas that are hard for you or learning more about something with which you feel comfortable.
- Apply the statistical and machine learning techniques we have learned.
- Create useful and clear graphs.
- Present your insights in a thoughtful, clear, and accurate way.

### Possible Outcomes
- Rank players by market value.
- Highlight the top players for their outstanding performances over a discrete season.
- Decide when to transfer a player.
- Decide the best replacement for a transferred player.



In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Read the FIFA 23 players CSV (path relative to the notebook) into a DataFrame
csv_path = "Fifa 23 Players Data.csv"
data = pd.read_csv(csv_path)

In [None]:
#TO-DO: Take a look at Data to confirm you loaded it correctly

### Data Cleaning


In [None]:
#TO-DO:
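#1. Make column names lower-case
#2. Replace spaces in column names with underscores (later cells use names such as 'value(in_euro)' and 'pace_total')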

#3. Check DataTypes

In [None]:
# Drop columns that are not needed for the analysis.
# errors='ignore' skips any listed column that is not present, so the cell can
# be re-run safely (the columns are already gone after the first run) and does
# not raise KeyError when the loaded CSV lacks one of them.
columns_to_drop = ['real_face', 'player_url', 'loaned_from', 'nation_position', 'nation_jersey_number']
data.drop(columns=columns_to_drop, inplace=True, errors='ignore')

In [None]:
#TO-DO: Handle missing values

In [None]:
#TO-DO: Check Duplicates

### Data Exploration

#### EDA

In [None]:
# Summary statistics: descriptive statistics of `data` as returned by
# DataFrame.describe() with its default arguments.
summary_stats = data.describe()
# Bare expression on the last line so the notebook renders the table
summary_stats

Visualizations:

In [None]:
# Histograms (with a KDE curve) of six player attributes, laid out on a 2x3 grid
attributes = ['overall', 'age', 'value(in_euro)', 'wage(in_euro)', 'height(in_cm)', 'weight(in_kg)']
fig, axes = plt.subplots(2, 3, figsize=(12, 8))
# axes.flat walks the grid row by row, matching subplot positions 1..6
for ax, attribute in zip(axes.flat, attributes):
    sns.histplot(data[attribute], kde=True, ax=ax)
    ax.set_title(f'Distribution of {attribute}')
fig.suptitle('Distribution of player attributes')
fig.tight_layout()
plt.show()

In [None]:
# Market value analysis: scatter of overall rating against market value.
# alpha=0.5 makes overlapping points semi-transparent so dense regions stay readable.
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(data=data, x='overall', y='value(in_euro)', alpha=0.5, ax=ax)
ax.set_title('Player Overall vs Market Value')
ax.set_xlabel('Overall Rating')
ax.set_ylabel('Market Value (EUR)')
plt.show()

In [89]:
# TO-DO: Add a Categorical Plot

Correlation Analysis

In [None]:
# Correlation analysis
# Correlation is only defined for numeric data. Restrict the frame to its
# numeric columns first: since pandas 2.0, DataFrame.corr() no longer drops
# non-numeric columns silently and raises if the frame still contains any.
numeric_data = data.select_dtypes(include='number')
corr_matrix = numeric_data.corr()
plt.figure(figsize=(10, 8))
# annot=False: no per-cell numbers are drawn, only the colour scale
sns.heatmap(corr_matrix, annot=False, cmap='coolwarm')
plt.title('Correlation Matrix')
plt.show()

### Data Analysis 

#### 1. Rank players by market value

In [None]:
# Ranking = rows of `data` ordered by the 'value(in_euro)' column, most valuable first
ranked_players = data.sort_values('value(in_euro)', ascending=False)

#### 2. Highlight the top players for their outstanding performances over a discrete season

In [None]:
# Define a composite performance metric: the overall rating plus the six
# attribute groups (pace, shooting, passing, dribbling, defending, physicality).
# NOTE(review): the other five groups use their '*_total' column, so
# 'dribbling_total' is preferred for consistency; if the frame has no such
# column we fall back to 'dribbling', the column originally used here.
# Confirm against the dataset's column list.
dribbling_col = 'dribbling_total' if 'dribbling_total' in data.columns else 'dribbling'
score_columns = [
    'overall',
    'pace_total',
    'shooting_total',
    'passing_total',
    dribbling_col,
    'defending_total',
    'physicality_total',
]
# skipna=False keeps the behaviour of chained '+': a player with any missing
# component gets a missing score instead of a silently smaller one.
data['performance_score'] = data[score_columns].sum(axis=1, skipna=False)

### Data Modeling

In [None]:
# Machine Learning Models (example: predicting player market value)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

# Selecting features and target
feature_cols = ['overall', 'age', 'potential', 'wage(in_euro)']
target_col = 'value(in_euro)'

# LinearRegression cannot be fitted on NaN values, so keep only the rows where
# every feature and the target are present. When the data has no missing
# values in these columns this is a no-op and the split below is unchanged.
model_data = data[feature_cols + [target_col]].dropna()
X = model_data[feature_cols]
y = model_data[target_col]

# Splitting the data into train and test sets (80/20, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training
model = LinearRegression()
model.fit(X_train, y_train)

# Model evaluation: predict on both splits to compare train and test error
train_preds = model.predict(X_train)
test_preds = model.predict(X_test)

# RMSE is in the same unit as the target column
train_rmse = np.sqrt(mean_squared_error(y_train, train_preds))
test_rmse = np.sqrt(mean_squared_error(y_test, test_preds))

print(f'Training RMSE: {train_rmse}')
print(f'Test RMSE: {test_rmse}')

# Conclusion
# Summarize findings and insights
