# Data Import 

### Import Libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

### Initial Data Exploration for our GBBO Dataset (Head, Shape, Info, Data Types & Describe)

In [2]:
GBBO_df = pd.read_csv('data/GBBO_data_set.csv')

In [3]:
#Remove those last 2 weird columns.
GBBO_df=GBBO_df.iloc[:,:16]

In [4]:
GBBO_df.head()

Unnamed: 0,Season,Judge,Week Number,Week Name,Baker,Gender,Age,Signature Handshake,Technical Rank,Showstopper Handshake,Favorite,Least Favorite,Star Baker,Eliminated,Competed,Winner
0,Series 1,Mary,1,Cake,Annetha,F,30,0,2.0,0,1.0,0,0,0,1,0
1,Series 1,Mary,1,Cake,David,M,31,0,3.0,0,0.0,1,0,0,1,0
2,Series 1,Mary,1,Cake,Edd,M,24,0,1.0,0,0.0,0,0,0,1,1
3,Series 1,Mary,1,Cake,Jasminder,F,45,0,,0,0.0,0,0,0,1,0
4,Series 1,Mary,1,Cake,Jonathan,M,25,0,9.0,0,0.0,0,0,0,1,0


In [5]:
GBBO_df.shape

(1256, 16)

In [6]:
GBBO_df.info()
#notice several missing values for Technical Rank.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1256 entries, 0 to 1255
Data columns (total 16 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Season                 1256 non-null   object 
 1   Judge                  1256 non-null   object 
 2   Week Number            1256 non-null   int64  
 3   Week Name              1256 non-null   object 
 4   Baker                  1256 non-null   object 
 5   Gender                 1256 non-null   object 
 6   Age                    1256 non-null   int64  
 7   Signature Handshake    1256 non-null   int64  
 8   Technical Rank         771 non-null    float64
 9   Showstopper Handshake  1256 non-null   int64  
 10  Favorite               1256 non-null   float64
 11  Least Favorite         1256 non-null   int64  
 12  Star Baker             1256 non-null   int64  
 13  Eliminated             1256 non-null   int64  
 14  Competed               1256 non-null   int64  
 15  Winn

In [7]:
GBBO_df.describe()

Unnamed: 0,Week Number,Age,Signature Handshake,Technical Rank,Showstopper Handshake,Favorite,Least Favorite,Star Baker,Eliminated,Competed,Winner
count,1256.0,1256.0,1256.0,771.0,1256.0,1256.0,1256.0,1256.0,1256.0,1256.0,1256.0
mean,5.328025,37.449045,0.024682,4.837873,0.003185,0.164013,0.170382,0.078025,0.095541,0.619427,0.082803
std,2.830237,12.841254,0.155214,2.967953,0.056366,0.369896,0.376118,0.268319,0.294078,0.485721,0.275693
min,1.0,17.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,29.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5.0,33.5,0.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
75%,8.0,46.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
max,10.0,71.0,1.0,13.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [8]:
GBBO_df.dtypes

Season                    object
Judge                     object
Week Number                int64
Week Name                 object
Baker                     object
Gender                    object
Age                        int64
Signature Handshake        int64
Technical Rank           float64
Showstopper Handshake      int64
Favorite                 float64
Least Favorite             int64
Star Baker                 int64
Eliminated                 int64
Competed                   int64
Winner                     int64
dtype: object

#### Renaming Columns and getting data types

In [9]:
list(GBBO_df.columns)

['Season',
 'Judge',
 'Week Number',
 'Week Name',
 'Baker',
 'Gender',
 'Age',
 'Signature Handshake',
 'Technical Rank',
 'Showstopper Handshake',
 'Favorite',
 'Least Favorite',
 'Star Baker',
 'Eliminated',
 'Competed',
 'Winner']

In [10]:
#I will now rename the columns to be a bit easier to program with
GBBO_df.rename(columns = {'Week Number':'Week_Number',
                          'Week Name':'Week_Name', 
                          'Signature Handshake':'Signature_Handshake',
                          'Technical Rank':'Technical_Rank',
                          'Showstopper Handshake':'Showstopper_Handshake',
                          'Least Favorite':'Least_Favorite',
                          'Star Baker':'Star_Baker'},
              inplace = True)

#### Initial Summary on Data

In [11]:
Unique_Bakers = GBBO_df.Baker.nunique()
Gender_Counts = GBBO_df.Gender.value_counts()
Num_Sig_Handshakes = GBBO_df.Signature_Handshake.sum()
Num_Show_Handshakes = GBBO_df.Showstopper_Handshake.sum()
Youngest_Baker = GBBO_df.Age.min()
Oldest_Baker = GBBO_df.Age.max()


print(f' Over 11 seasons, The Great British Bake Off has seen {Unique_Bakers} bakers, \
{Gender_Counts[0]} of whom identify as female and {Gender_Counts[1]} whom identify as male. \
The series has also seen bakers range in ages from the youngest of {Youngest_Baker} years old, \
to the oldest of {Oldest_Baker} years old. Paul, one of the judges, loves to give out handshakes after exceptional bakes and \
we have seen {Num_Sig_Handshakes} number of handshakes after Signature Challenges, \
and {Num_Show_Handshakes} after Showstopper Challenges.')

 Over 11 seasons, The Great British Bake Off has seen 118 bakers, 644 of whom identify as female and 612 whom identify as male. The series has also seen bakers range in ages from the youngest of 17 years old, to the oldest of 71 years old. Paul, one of the judges, loves to give out handshakes after exceptional bakes and we have seen 31 number of handshakes after Signature Challenges, and 4 after Showstopper Challenges.


### Data Manipulation for GBBO Data Set

1. Season one operated differently than the rest of the seasons. We had multiple contestants leave each week, handshakes weren't a thing, and we don't have a full list of technical scores. We will remove season 1. 
2. We will convert Season and Week_Number to category data types.
3. After further data exploration, we've discovered that the baker 'Brenden' was spelled 'Brendan'. 

#### Clean up 'Series' feature (Remove season 1, convert the data type to a category)

In [12]:
#Remove Season 1
df = GBBO_df.loc[GBBO_df.Season != 'Series 1']
print(df.Season.unique())
print(df.Season.dtype)

['Series 2' 'Series 3' 'Series 4' 'Series 5' 'Series 6' 'Series 7'
 'Series 8' 'Series 9' 'Series 10' 'Series 11']
object


In [13]:
#Remove "Series" from the name 
df.Season = df.loc[:,'Season'].str.replace("Series ","")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Season = df.loc[:,'Season'].str.replace("Series ","")


In [14]:
df.loc[:,'Season']

60       2
61       2
62       2
63       2
64       2
        ..
1251    11
1252    11
1253    11
1254    11
1255    11
Name: Season, Length: 1196, dtype: object

In [15]:
#and convert the data type to a category.Warning is okay because we are not using a copy
df.Season = df.loc[:,'Season'].astype('int').astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Season = df.loc[:,'Season'].astype('int').astype('category')


#### Clean up "Week_Number" (convert the data type to a category)

In [16]:
#convert to a category. Warning is okay because wea re not using a copy
df.Week_Number = df.loc[:,'Week_Number'].astype('category')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.Week_Number = df.loc[:,'Week_Number'].astype('category')


#### Fix Brenden

In [17]:
#Brenden's name is spelled incorrectly. Should be 'Brendan'
df.loc[:,'Baker'].replace('Brenden','Brendan',inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df.loc[:,'Baker'].replace('Brenden','Brendan',inplace=True)


### Initial Data Exploration for our Challenges Dataset (Head, Shape, Info, Data Types & Describe)

In [18]:
url = 'https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2022/2022-10-25/challenges.csv'

In [19]:
challenges = pd.read_csv(url)

In [20]:
challenges.head()

Unnamed: 0,series,episode,baker,result,signature,technical,showstopper
0,1,1,Annetha,IN,Light Jamaican Black Cakewith Strawberries and...,2.0,"Red, White & Blue Chocolate Cake with Cigarell..."
1,1,1,David,IN,Chocolate Orange Cake,3.0,Black Forest Floor Gateaux with Moulded Chocol...
2,1,1,Edd,IN,Caramel Cinnamon and Banana Cake,1.0,
3,1,1,Jasminder,IN,Fresh Mango and Passion Fruit Hummingbird Cake,,
4,1,1,Jonathan,IN,Carrot Cake with Lime and Cream Cheese Icing,9.0,Three Tiered White and Dark Chocolate with Alm...


In [21]:
challenges.shape

(1136, 7)

In [22]:
challenges.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1136 entries, 0 to 1135
Data columns (total 7 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   series       1136 non-null   int64  
 1   episode      1136 non-null   int64  
 2   baker        1136 non-null   object 
 3   result       710 non-null    object 
 4   signature    703 non-null    object 
 5   technical    696 non-null    float64
 6   showstopper  688 non-null    object 
dtypes: float64(1), int64(2), object(4)
memory usage: 62.2+ KB


In [23]:
challenges.describe()

Unnamed: 0,series,episode,technical
count,1136.0,1136.0,696.0
mean,5.838028,5.309859,4.843391
std,2.734378,2.825143,2.976147
min,1.0,1.0,1.0
25%,4.0,3.0,2.0
50%,6.0,5.0,4.0
75%,8.0,8.0,7.0
max,10.0,10.0,13.0


In [24]:
challenges.dtypes

series           int64
episode          int64
baker           object
result          object
signature       object
technical      float64
showstopper     object
dtype: object

### Data Manipulation for Challenges Data Set

1. Remove Season 1
2. Clean up column names to match GBBO dataset.
3. Remove season 11 from the challenges dataset.
4. Change Season and Week_Number to 'category' datatype to match the GBBO dataset.

In [25]:
#remove series 1 to match our existing DF
challenges=challenges[challenges.series != 1]

In [26]:
#Change col labels of the challenges dataset
challenges.rename(columns={"series":"Season","episode":"Week_Number","baker":"Baker"},inplace=True)

In [27]:
#Challenges only has data for the seasons 2-10 while our main df goes to season 11. I'll drop season 11 from our main df. 
df= df.loc[df.Season != 11]
df.Season.unique()

[2, 3, 4, 5, 6, 7, 8, 9, 10]
Categories (10, int64): [2, 3, 4, 5, ..., 8, 9, 10, 11]

In [28]:
#Change datatypes to match the GBBO datset
challenges.Season=challenges.loc[:,'Season'].astype('category')
challenges.Week_Number=challenges.loc[:,'Week_Number'].astype('category')

### Merging the datasets and doing further data Wrangling (dealing with nulls)

1. Merge the dataframes on Season, Week Number and Baker.
2. Remove un-necessary / redundant Colummns (result, technical, and judge).
3. Remove rows where bakers have been eliminated.
4. check and handle null values

In [29]:
#1.merge
final_df = pd.merge(df,challenges,how = 'left', on=['Season','Week_Number','Baker'])


In [30]:
#2.drop challenges.result column & technical column due to redundancy. drop judge column since Mary won't be a judge moving forward 
final_df.drop(columns=['result','technical','Judge'],inplace=True)

In [33]:
#3.dropping competed. 
final_df.drop(columns='Competed',inplace=True)

In [34]:
#4.investigate nulls
final_df.isnull().sum()

Season                    0
Week_Number               0
Week_Name                 0
Baker                     0
Gender                    0
Age                       0
Signature_Handshake       0
Technical_Rank            4
Showstopper_Handshake     0
Favorite                  0
Least_Favorite            0
Star_Baker                0
Eliminated                0
Winner                    0
signature                 3
showstopper              15
dtype: int64

#### Manually investigate technical_rank nulls

In [35]:
#4.delete the wrongfully inputted value for robert. He was eliminated back in week 5
delete = final_df[final_df.Technical_Rank.isna()].head(1)
final_df.drop(delete.index,inplace=True)

In [36]:
#4.delete the wrongfully inputted value for Peter. He was eliminated back in week 1
delete = final_df[final_df.Technical_Rank.isna()].head(2).iloc[[1]]
final_df.drop(delete.index,inplace=True)

In [37]:
#4.Tamal got 3rd in the competition
final_df.loc[584,'Technical_Rank'] = 3

In [38]:
#4.delete the wrongfully inputted value for Terry. He was eliminated back in week 5
delete = final_df[final_df.Technical_Rank.isna()]
final_df.drop(delete.index,inplace=True)

#### Manually investigate showstopper nulls

In [39]:
final_df[final_df.showstopper.isna()]

Unnamed: 0,Season,Week_Number,Week_Name,Baker,Gender,Age,Signature_Handshake,Technical_Rank,Showstopper_Handshake,Favorite,Least_Favorite,Star_Baker,Eliminated,Winner,signature,showstopper
2,2,1,Cake,Ian,M,40,0,10.0,0,0.0,0,0,0,0,Apple and Cinnamon Cupcakes,
4,2,1,Cake,Jason,M,19,0,6.0,0,0.0,0,0,0,0,Lemon Meringue CupcakesApple and Cinnamon Cupc...,
10,2,1,Cake,Urvashi,F,40,0,7.0,0,0.0,0,0,0,0,Cherry Blossom CupcakesJapanese Lime Cupcakes,
11,2,1,Cake,Yasmin,F,43,0,5.0,0,0.0,0,0,0,0,Cardamom and Pomegranate Cupcakes,
14,2,2,Tarts,Ian,M,40,0,2.0,0,1.0,0,0,0,0,"Stilton, Spinach and New Potato Quichewith Pap...",
24,2,3,Bread,Ben,M,31,0,3.0,0,0.0,0,0,0,0,"Walnut, Raisin and Rosemary Loaf",
28,2,3,Bread,Jason,M,19,0,7.0,0,0.0,0,0,0,0,Cheese and Onion Tear and Share Loaf,
29,2,3,Bread,Joanne,F,41,0,1.0,0,0.0,0,0,0,1,"Stromboli flavored with Mozzarella, Ham, and P...",
110,3,2,Bread,Danny,F,45,0,3.0,0,0.0,0,0,0,0,Lime Coriander and Coconut TortillasZaatar Naa...,
113,3,2,Bread,Manisha,F,27,0,6.0,0,0.0,0,0,0,0,Indian FlatbreadsItalian Flatbreads,


#### Using Wikipedia, We can fill in the bakes for Urvashi(Season 2, episode 1) and Manisha (Season 3, episode 2). The remaining NAs are listed as "unknown" on wikipedia. I'll fill the missing values with "Unknown"

In [40]:
#4.fix known missing values
final_df.loc[10,'showstopper'] = 'Orange Blossom Celebration Cake'
final_df.loc[113,'showstopper'] = 'Chocolate Orange Bagels'

In [41]:
#4.fill the remaining nulls with "Unkown"
final_df.showstopper.fillna("Unknown",inplace=True)

In [42]:
#4.Make sure we have no nulls!
final_df.isnull().sum()

Season                   0
Week_Number              0
Week_Name                0
Baker                    0
Gender                   0
Age                      0
Signature_Handshake      0
Technical_Rank           0
Showstopper_Handshake    0
Favorite                 0
Least_Favorite           0
Star_Baker               0
Eliminated               0
Winner                   0
signature                0
showstopper              0
dtype: int64

In [43]:
final_df.dtypes

Season                      int64
Week_Number              category
Week_Name                  object
Baker                      object
Gender                     object
Age                         int64
Signature_Handshake         int64
Technical_Rank            float64
Showstopper_Handshake       int64
Favorite                  float64
Least_Favorite              int64
Star_Baker                  int64
Eliminated                  int64
Winner                      int64
signature                  object
showstopper                object
dtype: object

In [47]:
# After the merge, the datatpyes changed. 
final_df= final_df.astype({'Season':'category','Week_Name':'category','Baker':'category','Gender':'category','Technical_Rank':'int64'})


In [50]:
final_df.dtypes

Season                   category
Week_Number              category
Week_Name                category
Baker                    category
Gender                   category
Age                         int64
Signature_Handshake         int64
Technical_Rank              int64
Showstopper_Handshake       int64
Favorite                  float64
Least_Favorite              int64
Star_Baker                  int64
Eliminated                  int64
Winner                      int64
signature                  object
showstopper                object
dtype: object

In [54]:
final_df.to_csv('data/final_df.csv')