In [55]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from bs4 import BeautifulSoup
import requests

# Load the combined transfer history CSV

In [5]:
# Read the combined transfer-history file into a DataFrame.
transfer_csv_path = '../raw_data/transfer_history_combined.csv'
transfer_df = pd.read_csv(transfer_csv_path)

In [6]:
# Dimensions of the freshly loaded table as (rows, columns).
transfer_df.shape

(174355, 13)

In [7]:
# Preview the first rows to see which columns the file contains.
transfer_df.head()

Unnamed: 0.1,Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,transfer_period,fee_cleaned,league_name,year,season
0,0,FC Girondins Bordeaux,Zinédine Zidane,20.0,Attacking Midfield,Cannes,£6.30m,in,Summer,6.3,Ligue 1,1992,1992/1993
1,1,FC Girondins Bordeaux,Laurent Croci,27.0,Defensive Midfield,FC Sochaux,?,in,Summer,,Ligue 1,1992,1992/1993
2,2,FC Girondins Bordeaux,Philippe Lucas,28.0,Defensive Midfield,FC Sochaux,?,in,Summer,,Ligue 1,1992,1992/1993
3,3,FC Girondins Bordeaux,Márcio Santos,22.0,Centre-Back,Botafogo,?,in,Summer,,Ligue 1,1992,1992/1993
4,4,FC Girondins Bordeaux,Jean-Francois Daniel,28.0,Central Midfield,Cannes,?,in,Summer,,Ligue 1,1992,1992/1993


In [65]:
# Remove the leftover 'Unnamed: 0' column (presumably an index saved by an
# earlier export -- confirm against how the CSV was written).
# Rebinding instead of inplace=True, together with errors='ignore', keeps the
# cell idempotent: running it a second time no longer raises KeyError.
transfer_df = transfer_df.drop(columns='Unnamed: 0', errors='ignore')

In [66]:
# Confirm that the 'Unnamed: 0' column is gone.
transfer_df.head()

Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,transfer_period,fee_cleaned,league_name,year,season
0,FC Girondins Bordeaux,Zinédine Zidane,20.0,Attacking Midfield,Cannes,£6.30m,in,Summer,6.3,Ligue 1,1992,1992/1993
1,FC Girondins Bordeaux,Laurent Croci,27.0,Defensive Midfield,FC Sochaux,?,in,Summer,,Ligue 1,1992,1992/1993
2,FC Girondins Bordeaux,Philippe Lucas,28.0,Defensive Midfield,FC Sochaux,?,in,Summer,,Ligue 1,1992,1992/1993
3,FC Girondins Bordeaux,Márcio Santos,22.0,Centre-Back,Botafogo,?,in,Summer,,Ligue 1,1992,1992/1993
4,FC Girondins Bordeaux,Jean-Francois Daniel,28.0,Central Midfield,Cannes,?,in,Summer,,Ligue 1,1992,1992/1993


In [67]:
# Summary statistics for the numeric columns (age, fee_cleaned, year).
# NOTE(review): the recorded output shows ages from -1776 to 117, so `age`
# contains invalid values that need cleaning before any analysis uses it.
transfer_df.describe()

Unnamed: 0,age,fee_cleaned,year
count,174301.0,143610.0,174355.0
mean,24.366865,0.849148,2008.668057
std,6.216425,3.820889,7.8416
min,-1776.0,0.0,1992.0
25%,21.0,0.0,2003.0
50%,24.0,0.0,2010.0
75%,27.0,0.0,2015.0
max,117.0,199.8,2021.0


In [69]:
# Column dtypes and non-null counts, to see where values are missing.
transfer_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 174355 entries, 0 to 174354
Data columns (total 12 columns):
 #   Column              Non-Null Count   Dtype  
---  ------              --------------   -----  
 0   club_name           174355 non-null  object 
 1   player_name         174355 non-null  object 
 2   age                 174301 non-null  float64
 3   position            174352 non-null  object 
 4   club_involved_name  174355 non-null  object 
 5   fee                 174211 non-null  object 
 6   transfer_movement   174355 non-null  object 
 7   transfer_period     159923 non-null  object 
 8   fee_cleaned         143610 non-null  float64
 9   league_name         174355 non-null  object 
 10  year                174355 non-null  int64  
 11  season              174355 non-null  object 
dtypes: float64(2), int64(1), object(9)
memory usage: 16.0+ MB


## League Names

In [70]:
# Distinct leagues present in the transfer data.
transfer_df['league_name'].unique()

array(['Ligue 1', '1 Bundesliga', 'Liga Nos', 'Eredivisie', 'Serie A',
       'Premier Liga', 'Championship', 'Premier League',
       'Primera Division'], dtype=object)

Having the `league_name` column lets us segment the data by country.

In [85]:
# Look at the last rows, i.e. the most recent season in the file.
transfer_df.tail()

Unnamed: 0,club_name,player_name,age,position,club_involved_name,fee,transfer_movement,transfer_period,fee_cleaned,league_name,year,season
174350,Rayo Vallecano,Iván Martos,24.0,Centre-Back,UD Almería,"End of loanJun 30, 2021",out,Summer,0.0,Primera Division,2021,2021/2022
174351,Rayo Vallecano,Antoñín,21.0,Centre-Forward,Granada CF,"End of loanJun 30, 2021",out,Summer,0.0,Primera Division,2021,2021/2022
174352,Rayo Vallecano,Fran García,22.0,Left-Back,RM Castilla,"End of loanJun 30, 2021",out,Summer,0.0,Primera Division,2021,2021/2022
174353,Villarreal CF,Santiago Cáseres,24.0,Defensive Midfield,Vélez Sarsfield,"End of loanDec 31, 2021",in,Winter,0.0,Primera Division,2021,2021/2022
174354,Rayo Vallecano,Lass Bangoura,29.0,Right Winger,CS Emelec,"End of loanDec 31, 2021",in,Winter,0.0,Primera Division,2021,2021/2022


## What should we do with `NaN` values in the `fee_cleaned` column?

In [73]:
# Number of transfers without a parsed fee.
transfer_df['fee_cleaned'].isna().sum()

30745

In [74]:
# Share of rows whose fee_cleaned is missing.
# The denominator must be transfer_df: the original divided by `df.shape[0]`,
# but no `df` is defined anywhere in this notebook, so the cell only worked
# with leftover kernel state and fails with NameError on a fresh run.
transfer_df.fee_cleaned.isnull().sum() / transfer_df.shape[0]

0.1763356370623154

About 17.6% of the values in the `fee_cleaned` column are null.

# Load the FIFA dataset

In [8]:
# Read the combined FIFA players file into a DataFrame.
fifa_csv_path = '../raw_data/players_combined.csv'
fifa_df = pd.read_csv(fifa_csv_path)

In [9]:
# Remove the leftover 'Unnamed: 0' column (presumably an index saved by an
# earlier export -- confirm against how the CSV was written).
# Rebinding with errors='ignore' keeps the cell idempotent: a second run no
# longer raises KeyError because the column is already gone.
fifa_df = fifa_df.drop(columns='Unnamed: 0', errors='ignore')

In [10]:
# Two sample rows are enough to see the (wide) column layout.
fifa_df.head(2)

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club_name,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,fifa year
0,158023,https://sofifa.com/player/158023/lionel-messi/...,L. Messi,Lionel Andrés Messi Cuccittini,27,1987-06-24,169,67,Argentina,FC Barcelona,...,62+3,62+3,62+3,62+3,54+3,45+3,45+3,45+3,54+3,2015
1,20801,https://sofifa.com/player/20801/c-ronaldo-dos-...,Cristiano Ronaldo,Cristiano Ronaldo dos Santos Aveiro,29,1985-02-05,185,80,Portugal,Real Madrid,...,63+3,63+3,63+3,63+3,57+3,52+3,52+3,52+3,57+3,2015


In [105]:
# Check which Python/NumPy type the ids are stored as.
type(fifa_df.loc[0, 'sofifa_id'])

numpy.int64

# Match both columns

Get all the player names from `transfer_df`.

In [19]:
# Series of player names as they are spelled in the transfer data.
player_names = transfer_df['player_name']

Loop over all the `player_names` and look each one up in `fifa_df`; whenever a name matches, increment a counter.

In [13]:
# Spot check: FIFA rows whose long name contains one transfer-data name.
name_matches = fifa_df['long_name'].str.contains('Iván Martos')
fifa_df.loc[name_matches]

Unnamed: 0,sofifa_id,player_url,short_name,long_name,age,dob,height_cm,weight_kg,nationality,club_name,...,ldm,cdm,rdm,rwb,lb,lcb,cb,rcb,rb,fifa year
95507,246279,https://sofifa.com/player/246279/ivan-martos-c...,Iván Martos,Iván Martos Campillo,22,1997-05-15,182,68,Spain,UD Almería,...,60+2,60+2,60+2,64+2,64+2,63+2,63+2,63+2,64+2,2020
109473,246279,https://sofifa.com/player/246279/ivan-martos-c...,Iván Martos,Iván Martos Campillo,23,1997-05-15,182,68,Spain,UD Almería,...,61+2,61+2,61+2,66+2,67+2,66+2,66+2,66+2,67+2,2021


In [27]:
# Series of full player names as they are spelled in the FIFA data.
fifa_long_names = fifa_df['long_name']

In [28]:
# Show the FIFA long names to compare their format with the transfer names.
fifa_long_names

0              Lionel Andrés Messi Cuccittini
1         Cristiano Ronaldo dos Santos Aveiro
2                                Arjen Robben
3                          Zlatan Ibrahimović
4                                Manuel Neuer
                         ...                 
122836                           Kevin Angulo
122837                         Mengxuan Zhang
122838                                    王政豪
122839                            Zitong Chen
122840                               Yue Song
Name: long_name, Length: 122841, dtype: object

In [24]:
# Show the transfer-data names to compare their format with the FIFA long names.
player_names

0              Zinédine Zidane
1                Laurent Croci
2               Philippe Lucas
3                Márcio Santos
4         Jean-Francois Daniel
                  ...         
174350             Iván Martos
174351                 Antoñín
174352             Fran García
174353        Santiago Cáseres
174354           Lass Bangoura
Name: player_name, Length: 174355, dtype: object

In [47]:
# First whitespace-separated token of the first player's name.
first_player = player_names[0]
first_player.split(maxsplit=1)[0]

'Zinédine'

---

### Which clubs have made the most money out of transfers?

This value might be a little skewed, as I have summed all values. We need to sum only the rows where `transfer_movement` is `in`.

----
## WORKING WITH THE `NOT-MATCHING` CSV FILE

In [2]:
# Read the list of transfer-data names that had no match in the FIFA data.
no_match_csv_path = '../raw_data/not_matching_names.csv'
no_match_df = pd.read_csv(no_match_csv_path)

In [3]:
# Inspect the unmatched names as loaded.
no_match_df

Unnamed: 0.1,Unnamed: 0,name
0,0,Ángel Di María
1,5,Sergi Darder
2,9,Rafael
3,11,Olivier Kemen
4,12,Ivan Cavaleiro
...,...,...
1818,3910,Iván Marcone
1819,3911,Lucas Boyé
1820,3912,Matthew Hoppe
1821,3913,Dominik Greif


In [52]:
# Remove the leftover 'Unnamed: 0' column so that only `name` remains.
# Rebinding with errors='ignore' keeps the cell idempotent: a second run no
# longer raises KeyError because the column is already gone.
no_match_df = no_match_df.drop(columns='Unnamed: 0', errors='ignore')

In [127]:
# Add a placeholder id column (all zeros) for the scraper to fill in.
no_match_df = no_match_df.assign(sofifa_id=0)

In [128]:
# Confirm the frame now has the `name` and placeholder `sofifa_id` columns.
no_match_df

Unnamed: 0,name,sofifa_id
0,Ángel Di María,0
1,Sergi Darder,0
2,Rafael,0
3,Olivier Kemen,0
4,Ivan Cavaleiro,0
...,...,...
1818,Iván Marcone,0
1819,Lucas Boyé,0
1820,Matthew Hoppe,0
1821,Dominik Greif,0


# Using Beautiful Soup to scrape players' sofifa IDs

In [85]:
# Probe the SoFIFA player search with a single known name before scraping
# in bulk. `url` is reused by the scraping loop further down.
url = "https://sofifa.com/players"
params = {
    'keyword': 'Angel Di Maria'
}
# The timeout stops the cell from hanging forever on a stalled connection;
# raise_for_status() surfaces an HTTP error here instead of letting an error
# page be parsed as if it were a search result.
response = requests.get(url, params=params, timeout=30)
response.raise_for_status()

In [86]:
# Parse the search-results page with the standard-library HTML parser.
soup = BeautifulSoup(markup=response.content, features='html.parser')

In [87]:
# Inspect the parsed page to locate the element that carries the player id.
soup

<!DOCTYPE html>

<html lang="en-US">
<head><title>Players FIFA 22 Nov 23, 2021 SoFIFA</title>
<meta charset="utf-8"/>
<meta content="width=device-width, initial-scale=1" name="viewport"/>
<meta content="VGaFvm9Qm-qhCd2LCDi8QKWXNPSrVE2SJ2iT8c3vI1g" name="google-site-verification"/>
<meta content="FIFA 22 player ratings on Nov 23, 2021" name="description"/>
<meta content="Khachin Borjigin" name="author"/>
<meta content="2009 - 2021 SoFIFA.com" name="copyright"/>
<meta content="light" name="twitter:widgets:theme"/>
<meta content="#e6e6e6" name="twitter:widgets:border-color"/>
<link href="/launcher-icon.png" rel="shortcut icon"/>
<link href="/launcher-icon.png" rel="apple-touch-icon"/>
<link href="https://cdn.sofifa.net" rel="preconnect"/>
<link href="https://cdn.sofifa.net" rel="dns-prefetch"/>
<link href="https://config.playwire.com/" rel="preconnect"/>
<link href="https://config.playwire.com/" rel="dns-prefetch"/>
<link href="https://cdn.intergient.com/" rel="preconnect"/>
<link href="h

In [93]:
# The first <img class="player-check"> on the page carries the player id
# in its `id` attribute.
first_hit = soup.find("img", class_="player-check")
first_hit.get('id')

'183898'

In [109]:
# Look at one unmatched name (the row labelled 3).
no_match_df.loc[3, 'name']

'Olivier Kemen'

In [136]:
# Search SoFIFA for every unmatched name and record the id of the first hit.
# Relies on `url` from the probe cell above.

# String sentinel for "no result". Kept exactly as 'Nan' because the later
# value_counts() cell and the notes below refer to this value.
NOT_FOUND = 'Nan'

scraped_ids = []
# One request per row (about 1.8k rows in the recorded output); a Session
# reuses the underlying connection instead of opening a new one each time.
with requests.Session() as session:
    for name in no_match_df['name']:
        # The timeout keeps a single stalled request from hanging the whole run.
        response = session.get(url, params={'keyword': name}, timeout=30)
        soup = BeautifulSoup(response.content, 'html.parser')
        first_hit = soup.find("img", class_="player-check")
        # find() returns None when the page lists no player; test for that
        # directly rather than catching the AttributeError it would cause.
        scraped_ids.append(first_hit.get('id') if first_hit is not None else NOT_FOUND)

# Assign the whole column in one step. The original element-wise chained
# assignment (df[col][idx] = value) triggers SettingWithCopyWarning and is not
# guaranteed to write through to the frame.
# NOTE(review): only the first search hit is taken, with no check that it is
# the right player -- see the duplicated ids in the cells below.
no_match_df['sofifa_id'] = scraped_ids

In [132]:
# Eyeball the first 50 scraped ids.
no_match_df.iloc[:50]

Unnamed: 0,name,sofifa_id
0,Ángel Di María,183898
1,Sergi Darder,202648
2,Rafael,216547
3,Olivier Kemen,220175
4,Ivan Cavaleiro,212267
5,Rony Lopes,212692
6,Guido Carrillo,215334
7,Fabinho,209499
8,Gil Dias,229453
9,Lucas Ocampos,205632


In [140]:
# Which names were all assigned the id '210514'?
is_shared_id = no_match_df['sofifa_id'].eq('210514')
no_match_df.loc[is_shared_id]

Unnamed: 0,name,sofifa_id
260,Pedro,210514
325,João Cancelo,210514
1023,João Pedro,210514


In [138]:
# How often each scraped id occurs; any count above 1 means several
# different names were given the same id.
id_counts = no_match_df['sofifa_id'].value_counts()
id_counts.iloc[:30]

Nan       514
210514      3
202651      3
212198      2
236791      2
205943      2
230666      2
226790      2
184134      2
189509      2
241184      2
236499      2
231866      2
20801       2
221639      2
218667      2
210257      2
216466      2
199042      2
178005      2
216352      2
191043      2
198950      2
212814      2
224458      2
247182      2
209499      2
205498      2
201942      2
247851      1
Name: sofifa_id, dtype: int64

 * Why is the same `sofifa_id` returned for several players (e.g. <u>205632</u>, or `210514` in the table above)?
 * Most of them don't even have similar names.


----

**What's the next step?** 
 * Once a `sofifa_id` is linked to each player name, we could merge **`no_match_df`** with **`transfer_df`** on `name` = `player_name`, and then use the `sofifa_id` to merge in **`fifa_df`**
 * The `name` in **`no_match_df`** has been retrieved from **`transfer_df`**
 * The `sofifa_id` can be linked with **`fifa_df`**

----
 * How should we proceed with players whose name search did not return any results?
 * There are cases where the returned value is `Nan` but the player exists in the `SOFIFA DB`
 * Douglas Costa is one of those cases
 * https://sofifa.com/players?keyword=douglas+costa is an empty page with no results
 * Looking for the player manually, we find https://sofifa.com/player/190483/douglas-costa-de-souza/210047
 * This is the player's page, but the URL is too convoluted to search for it via the current method
 * Possible solution: Selenium WebDriver

---
 * The second case is Georges-Kevin N'Koudou
 * As the name is currently spelled, the `SOFIFA DB` doesn't find the player
 * The player is found by searching for Georges-Kevin NKoudou (without the apostrophe)

----
