## Section 1: Preparing the data
Taking a peek at the data and making any changes needed to tidy it up.

## 1.1 Imports and Uploads
Importing the required dependencies and loading the data into a dataframe

In [91]:
# All required imports for the project
import pandas as pd
import altair as alt
import numpy as np

# Loading the data from the CSV file and taking a look at the first 5 rows
df = pd.read_csv('fifa_world_cup.csv')
df.head()

Unnamed: 0,team1,team2,possession team1,possession team2,possession in contest,number of goals team1,number of goals team2,date,hour,category,...,penalties scored team1,penalties scored team2,goal preventions team1,goal preventions team2,own goals team1,own goals team2,forced turnovers team1,forced turnovers team2,defensive pressures applied team1,defensive pressures applied team2
0,QATAR,ECUADOR,42%,50%,8%,0,2,20 NOV 2022,17 : 00,Group A,...,0,1,6,5,0,0,52,72,256,279
1,ENGLAND,IRAN,72%,19%,9%,6,2,21 NOV 2022,14 : 00,Group B,...,0,1,8,13,0,0,63,72,139,416
2,SENEGAL,NETHERLANDS,44%,45%,11%,0,2,21 NOV 2022,17 : 00,Group A,...,0,0,9,15,0,0,63,73,263,251
3,UNITED STATES,WALES,51%,39%,10%,1,1,21 NOV 2022,20 : 00,Group B,...,0,1,7,7,0,0,81,72,242,292
4,ARGENTINA,SAUDI ARABIA,64%,24%,12%,1,2,22 NOV 2022,11 : 00,Group C,...,1,0,4,14,0,0,65,80,163,361


## 1.2 Checking the dataframe for messy data

In [92]:
# Checking each column's non-null count and dtype to see if any data is missing
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 88 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   team1                                                  64 non-null     object
 1   team2                                                  64 non-null     object
 2   possession team1                                       64 non-null     object
 3   possession team2                                       64 non-null     object
 4   possession in contest                                  64 non-null     object
 5   number of goals team1                                  64 non-null     int64 
 6   number of goals team2                                  64 non-null     int64 
 7   date                                                   64 non-null     object
 8   hour                                                   64 non-

    All columns have the expected 64 values with no null values

## 1.3 Dropping irrelevant columns from the dataframe

In [93]:
# Dropping the columns that will not be used in the analysis and renaming the
# columns whose names contain typos (a missing or doubled space before "team1"/"team2")
drops = ['date', 'hour', 'possession in contest']
renames = {'completed defensive line breaksteam1': 'completed defensive line breaks team1',
          'completed line breaksteam1': 'completed line breaks team1',
          'attempts inside the penalty area  team2': 'attempts inside the penalty area team2',
          'attempts outside the penalty area  team1': 'attempts outside the penalty area team1',
          'attempts outside the penalty area  team2': 'attempts outside the penalty area team2'}
df = df.drop(columns=drops).rename(columns=renames)

# Duplicating the category column for future use.
# The copy is inserted directly after 'category'; the position is looked up
# instead of hard-coded so it stays correct if the set of dropped columns changes.
category2 = df['category'].copy()
df.insert(df.columns.get_loc('category') + 1, 'category2', category2)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 64 entries, 0 to 63
Data columns (total 86 columns):
 #   Column                                                 Non-Null Count  Dtype 
---  ------                                                 --------------  ----- 
 0   team1                                                  64 non-null     object
 1   team2                                                  64 non-null     object
 2   possession team1                                       64 non-null     object
 3   possession team2                                       64 non-null     object
 4   number of goals team1                                  64 non-null     int64 
 5   number of goals team2                                  64 non-null     int64 
 6   category                                               64 non-null     object
 7   category2                                              64 non-null     object
 8   total attempts team1                                   64 non-

## 1.4 Transforming the dataframe to make it easier to work with

In [94]:
# Every statistic is stored as a pair of adjacent columns (team1 first, team2 second).
# We stack each team2 column underneath its corresponding team1 column so that every
# row describes a single team in a single match.
# This allows us to work with the data much more easily while answering the questions.

# Names of the stacked columns, in the same order as the column pairs in df
stat_names = [
    'team', 'possession', 'number of goals', 'category', 'total attempts', 'conceded',
    'goal inside the penalty', 'goal outside the penalty', 'assists',
    'on target attempts', 'off target attempts',
    'attempts inside the penalty', 'attempts outside the penalty',
    'left channel', 'left inside channel', 'central channel',
    'right inside channel', 'right channel',
    'total offers to receive', 'inbehind offers to receive',
    'inbetween offers to receive', 'infront offers to receive',
    'receptions between midfield and defensive lines',
    'attempted line breaks', 'completed line breaks',
    'attempted defensive line breaks', 'completed defensive line breaks',
    'yellow cards', 'red cards', 'fouls against', 'offsides',
    'passes', 'passes completed', 'crosses', 'crosses completed',
    'switches of play completed', 'corners', 'free kicks', 'penalties scored',
    'goal preventions', 'own goals', 'forced turnovers', 'defensive pressures applied',
]

# Each stacked column consumes exactly one pair of input columns; fail loudly if the
# frame does not have the expected width (e.g. this cell was already run on df)
assert df.shape[1] == 2 * len(stat_names), (
    f'expected {2 * len(stat_names)} columns, found {df.shape[1]}'
)

# items[i] holds the first column's values followed by the second column's values
# for the i-th pair (no variable is named `list`, so the builtin stays usable)
items = [
    df.iloc[:, 2 * i].tolist() + df.iloc[:, 2 * i + 1].tolist()
    for i in range(len(stat_names))
]

data = dict(zip(stat_names, items))
df = pd.DataFrame(data=data)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128 entries, 0 to 127
Data columns (total 43 columns):
 #   Column                                           Non-Null Count  Dtype 
---  ------                                           --------------  ----- 
 0   team                                             128 non-null    object
 1   possession                                       128 non-null    object
 2   number of goals                                  128 non-null    int64 
 3   category                                         128 non-null    object
 4   total attempts                                   128 non-null    int64 
 5   conceded                                         128 non-null    int64 
 6   goal inside the penalty                          128 non-null    int64 
 7   goal outside the penalty                         128 non-null    int64 
 8   assists                                          128 non-null    int64 
 9   on target attempts                         

## Section 2: Question #1
Does strong defensive play lead to more attacking chances?


## 2.1 Looking for obvious patterns
First, we'll sort by number of goals to see if there are any obvious patterns

In [114]:
# Keep only the team name and the two statistics examined in this section
defense = df[['team', 'forced turnovers', 'completed defensive line breaks']]

teams = defense['team'].unique()

# Total forced turnovers per team, summed over all of that team's rows.
# groupby keeps the result (indexed by team name) instead of computing a sum per
# team in a loop and throwing it away; sort=False keeps the same order as `teams`.
forced_turnovers_by_team = defense.groupby('team', sort=False)['forced turnovers'].sum()

defense.head()

Unnamed: 0,team,forced turnovers,completed defensive line breaks
0,QATAR,52,4
1,ENGLAND,63,16
2,SENEGAL,63,15
3,UNITED STATES,81,15
4,ARGENTINA,65,25


    I want to look at goal preventions, goals conceded, and forced turnovers compared to total attempts, number of goals, and completed defensive line breaks

    Find each team's average forced turnovers and average line breaks

## 2.2 Creating scatter plot
Let's create some scatter plots to show each feature's direct relationship with goals scored

In [107]:
# Scatter plot with one circle per row of `defense`:
# forced turnovers on the x-axis, completed defensive line breaks on the y-axis
chart = (
    alt.Chart(defense)
    .mark_circle()
    .encode(x='forced turnovers', y='completed defensive line breaks')
)
chart

## Section 3: Question #2
Which group was the most difficult to escape in the World Cup?

## Section 4: Question #3
Which team had the best statistical tournament in the World Cup?