In [1]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd
import tensorflow as tf
import numpy as np


#  Import and read the data.csv.
import pandas as pd 
data_df = pd.read_csv("archive/data.csv")
#data_df.head()

In [2]:
#Drop unnecessary columns
data_df = data_df.drop(["Unnamed: 0","Unnamed: 0.1", "ID", "Name", "Team", "Games","Year","City", "Event", "Medal", "Host", "Nat_Pop", "Global_Pop%", "Rural_Pop%", "Life_Expect", "GDP", "GDP_Per_Cap", "Global_GDP%"], 1)

#Drop NaN values
data_df = data_df.dropna()

data_df.head(3)

Unnamed: 0,Sex,Age,Height,Weight,NOC,Season,Sport
0,M,24.0,180.0,80.0,CHN,Summer,Basketball
1,M,23.0,170.0,60.0,CHN,Summer,Judo
2,F,21.0,185.0,82.0,NLD,Winter,Speed Skating


In [3]:
#Filter out season data to just summer
data_df = data_df[data_df["Season"].str.contains("Winter") == False]

In [4]:
#Drop Season since no longer needed after filtering
data_df = data_df.drop(["Season"],1)
data_df.head()

Unnamed: 0,Sex,Age,Height,Weight,NOC,Sport
0,M,24.0,180.0,80.0,CHN,Basketball
1,M,23.0,170.0,60.0,CHN,Judo
26,M,31.0,172.0,70.0,FIN,Badminton
29,M,31.0,189.0,130.0,FIN,Athletics
51,F,22.0,170.0,125.0,ROU,Weightlifting


In [5]:
# Create list of sports
sports = data_df['Sport'].unique()
print(sports)

['Basketball' 'Judo' 'Badminton' 'Athletics' 'Weightlifting' 'Wrestling'
 'Rowing' 'Swimming' 'Football' 'Equestrianism' 'Shooting' 'Gymnastics'
 'Taekwondo' 'Boxing' 'Fencing' 'Diving' 'Canoeing' 'Handball'
 'Water Polo' 'Tennis' 'Cycling' 'Hockey' 'Softball' 'Archery'
 'Volleyball' 'Synchronized Swimming' 'Modern Pentathlon' 'Table Tennis'
 'Baseball' 'Rhythmic Gymnastics' 'Rugby Sevens' 'Trampolining'
 'Beach Volleyball' 'Triathlon' 'Golf']


In [6]:
# create list for each sport
sport_list = {sport:[] for sport in sports}
sport_list

{'Basketball': [],
 'Judo': [],
 'Badminton': [],
 'Athletics': [],
 'Weightlifting': [],
 'Wrestling': [],
 'Rowing': [],
 'Swimming': [],
 'Football': [],
 'Equestrianism': [],
 'Shooting': [],
 'Gymnastics': [],
 'Taekwondo': [],
 'Boxing': [],
 'Fencing': [],
 'Diving': [],
 'Canoeing': [],
 'Handball': [],
 'Water Polo': [],
 'Tennis': [],
 'Cycling': [],
 'Hockey': [],
 'Softball': [],
 'Archery': [],
 'Volleyball': [],
 'Synchronized Swimming': [],
 'Modern Pentathlon': [],
 'Table Tennis': [],
 'Baseball': [],
 'Rhythmic Gymnastics': [],
 'Rugby Sevens': [],
 'Trampolining': [],
 'Beach Volleyball': [],
 'Triathlon': [],
 'Golf': []}

In [7]:
# Populate sport lists
for sport in sports:
    for value in data_df['Sport']:
        if value == sport:
            sport_list[sport].append(True),
        else:
            sport_list[sport].append(False)


In [8]:
# check length of lists
len(sport_list['Judo'])

152696

In [9]:
# check length of df to make sure list has appended
len(data_df['Sport'])

152696

In [10]:
# Add column for each sport
for sport in sports:
    data_df[sport] = sport_list[sport]


data_df.head()

Unnamed: 0,Sex,Age,Height,Weight,NOC,Sport,Basketball,Judo,Badminton,Athletics,...,Synchronized Swimming,Modern Pentathlon,Table Tennis,Baseball,Rhythmic Gymnastics,Rugby Sevens,Trampolining,Beach Volleyball,Triathlon,Golf
0,M,24.0,180.0,80.0,CHN,Basketball,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,M,23.0,170.0,60.0,CHN,Judo,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
26,M,31.0,172.0,70.0,FIN,Badminton,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
29,M,31.0,189.0,130.0,FIN,Athletics,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
51,F,22.0,170.0,125.0,ROU,Weightlifting,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [11]:
# Drop Sport Column
data_df = data_df.drop(['Sport'],1)

In [12]:
# Seperate mens and womens sports
male_df = data_df[data_df['Sex'] == "M"]
female_df = data_df[data_df['Sex'] == "F"]

In [13]:
# Drop Sex column
male_df = male_df.drop(["Sex"],1)
female_df = female_df.drop(["Sex"],1)

In [14]:
# Drop the single sex sports from respective dataframes
male_df = male_df.drop(["Softball", "Synchronized Swimming", "Rhythmic Gymnastics"],1)
female_df = female_df.drop(["Baseball"],1)

In [15]:
male_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Basketball,Judo,Badminton,Athletics,Weightlifting,Wrestling,...,Archery,Volleyball,Modern Pentathlon,Table Tennis,Baseball,Rugby Sevens,Trampolining,Beach Volleyball,Triathlon,Golf
0,24.0,180.0,80.0,CHN,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,23.0,170.0,60.0,CHN,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
26,31.0,172.0,70.0,FIN,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
29,31.0,189.0,130.0,FIN,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
52,22.0,187.0,89.0,NOR,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [16]:
#Begin binning data to make it easier to get dummies
#Find min and max of data types to create bins
male_df['Age'] = male_df['Age'].astype(float, errors = 'raise')

mMin_age = male_df['Age'].min()
mMax_age = male_df['Age'].max()
print(mMin_age)
print(mMax_age)

#Create bins with min and max
mAge_bins = np.linspace(mMin_age,mMax_age,10)
mAge_bin_names = ["0","1","2","3","4","5","6","7","8"]

# Add column
male_df["Age"] = pd.cut(male_df["Age"], mAge_bins, labels=mAge_bin_names)

12.0
71.0


In [17]:
#Find min and max of data types to create bins
male_df['Height'] = male_df['Height'].astype(float, errors = 'raise')

mMin_height = male_df['Height'].min()
mMax_height = male_df['Height'].max()
print(mMin_height)
print(mMax_height)

#Create bins with min and max
mHeight_bins = np.linspace(mMin_height,mMax_height,10)
mHeight_bin_names = ["0","1","2","3","4","5","6","7","8"]

# Add column
male_df["Height"] = pd.cut(male_df["Height"], mHeight_bins, labels=mHeight_bin_names)

127.0
226.0


In [18]:
#Find min and max of data types to create bins
male_df['Weight'] = male_df['Weight'].astype(float, errors = 'raise')

mMin_Weight = male_df['Weight'].min()
mMax_Weight = male_df['Weight'].max()
print(mMin_Weight)
print(mMax_Weight)

#Create bins with min and max
mWeight_bins = np.linspace(mMin_Weight,mMax_Weight,10)
mWeight_bin_names = ["0","1","2","3","4","5","6","7","8"]

# Add column
male_df["Weight"] = pd.cut(male_df["Weight"], mWeight_bins, labels=mWeight_bin_names)

37.0
214.0


In [19]:
male_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Basketball,Judo,Badminton,Athletics,Weightlifting,Wrestling,...,Archery,Volleyball,Modern Pentathlon,Table Tennis,Baseball,Rugby Sevens,Trampolining,Beach Volleyball,Triathlon,Golf
0,1,4,2,CHN,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,3,1,CHN,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
26,2,4,1,FIN,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
29,2,5,4,FIN,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
52,1,5,2,NOR,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [20]:
#Find min and max of data types to create bins
female_df['Age'] = female_df['Age'].astype(float, errors = 'raise')

fMin_Age = female_df['Age'].min()
fMax_Age = female_df['Age'].max()
print(fMin_Age)
print(fMax_Age)

#Create bins with min and max
fAge_bins = np.linspace(fMin_Age,fMax_Age,10)
fAge_bin_names = ["0","1","2","3","4","5","6","7","8"]

# Add column
female_df["Age"] = pd.cut(female_df["Age"], fAge_bins, labels=fAge_bin_names)

11.0
69.0


In [21]:
#Find min and max of data types to create bins
female_df['Height'] = female_df['Height'].astype(float, errors = 'raise')

fMin_Height = female_df['Height'].min()
fMax_Height = female_df['Height'].max()
print(fMin_Height)
print(fMax_Height)

#Create bins with min and max
fHeight_bins = np.linspace(fMin_Height,fMax_Height,10)
fHeight_bin_names = ["0","1","2","3","4","5","6","7","8"]

# Add column
female_df["Height"] = pd.cut(female_df["Height"], fHeight_bins, labels=fHeight_bin_names)

127.0
213.0


In [22]:
#Find min and max of data types to create bins
female_df['Weight'] = female_df['Weight'].astype(float, errors = 'raise')

fMin_Weight = female_df['Weight'].min()
fMax_Weight = female_df['Weight'].max()
print(fMin_Weight)
print(fMax_Weight)

#Create bins with min and max
fWeight_bins = np.linspace(fMin_Weight,fMax_Weight,10)
fWeight_bin_names = ["0","1","2","3","4","5","6","7","8"]

# Add column
female_df["Weight"] = pd.cut(female_df["Weight"], fWeight_bins, labels=fWeight_bin_names)

25.0
167.0


In [23]:
female_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Basketball,Judo,Badminton,Athletics,Weightlifting,Wrestling,...,Volleyball,Synchronized Swimming,Modern Pentathlon,Table Tennis,Rhythmic Gymnastics,Rugby Sevens,Trampolining,Beach Volleyball,Triathlon,Golf
51,1,4,6,ROU,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
69,1,5,2,NOR,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
81,2,4,2,EST,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
82,2,4,2,EST,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
101,1,3,1,AZE,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Men's Basketball Regression

In [24]:
# Preserve Basketball column
mBasketball_list = male_df['Basketball'].tolist()

In [25]:
# Delete all sport columns
mBasketball_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mBasketball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [26]:
# Add basketball column back in
mBasketball_df['Basketball'] = mBasketball_list

mBasketball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Basketball
0,1,4,2,CHN,True
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [27]:
mBasketball_dummies = pd.get_dummies(mBasketball_df)
mBasketball_dummies.head(3)

Unnamed: 0,Basketball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,True,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [28]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mBasketball_dummies.drop(columns=['Basketball'])
y_train = mBasketball_dummies['Basketball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mBasketball_dummies.drop(columns=['Basketball'])
y_test = mBasketball_dummies['Basketball']

In [29]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9810693499062839

In [30]:
mBasketball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mBasketball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mBasketball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mBasketball_df)

array([0.33198692, 0.03064336, 0.10720791, ..., 0.0949339 , 0.20147517,
       0.25171826])

Unnamed: 0,Age,Height,Weight,NOC,Basketball
0,1,4,2,CHN,True
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [31]:
mBasketball_pred = y_pred.tolist()

In [32]:
mBasketball_df['Men_Basketball_Prediction'] = mBasketball_pred

In [33]:
mBasketball_df

Unnamed: 0,Age,Height,Weight,NOC,Basketball,Men_Basketball_Prediction
0,1,4,2,CHN,True,0.331987
1,1,3,1,CHN,False,0.030643
26,2,4,1,FIN,False,0.107208
29,2,5,4,FIN,False,0.107040
52,1,5,2,NOR,False,0.002615
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.024002
204023,2,3,1,ARG,False,0.031747
204024,1,4,1,USA,False,0.094934
204025,1,5,1,RUS,False,0.201475


### Men's Judo Regression

In [34]:
# Preserve Judo column
mJudo_list = male_df['Judo'].tolist()

In [35]:
# Delete all sport columns
mJudo_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mJudo_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [36]:
# Add Judo column back in
mJudo_df['Judo'] = mJudo_list

mJudo_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Judo
0,1,4,2,CHN,False
1,1,3,1,CHN,True
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [37]:
mJudo_dummies = pd.get_dummies(mJudo_df)
mJudo_dummies.head(3)

Unnamed: 0,Judo,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,True,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mJudo_dummies.drop(columns=['Judo'])
y_train = mJudo_dummies['Judo']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mJudo_dummies.drop(columns=['Judo'])
y_test = mJudo_dummies['Judo']

In [39]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9764525993883792

In [40]:
mJudo_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mJudo_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mJudo_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mJudo_df)

array([0.74964658, 0.64093691, 0.28978635, ..., 0.24286814, 0.1112376 ,
       0.13194037])

Unnamed: 0,Age,Height,Weight,NOC,Judo
0,1,4,2,CHN,False
1,1,3,1,CHN,True
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [41]:
mJudo_pred = y_pred.tolist()

In [42]:
mJudo_df['Men_Judo_Prediction'] = mJudo_pred

In [43]:
mJudo_df

Unnamed: 0,Age,Height,Weight,NOC,Judo,Men_Judo_Prediction
0,1,4,2,CHN,False,0.749647
1,1,3,1,CHN,True,0.640937
26,2,4,1,FIN,False,0.289786
29,2,5,4,FIN,False,0.881562
52,1,5,2,NOR,False,0.152283
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.610823
204023,2,3,1,ARG,False,0.655888
204024,1,4,1,USA,False,0.242868
204025,1,5,1,RUS,False,0.111238


### Men's Badminton Regression

In [44]:
# Preserve Badminton column
mBadminton_list = male_df['Badminton'].tolist()

In [45]:
# Delete all sport columns
mBadminton_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mBadminton_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [46]:
# Add badminton column back in
mBadminton_df['Badminton'] = mBadminton_list

mBadminton_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Badminton
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,True
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [47]:
mBadminton_dummies = pd.get_dummies(mBadminton_df)
mBadminton_dummies.head(3)

Unnamed: 0,Badminton,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,True,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mBadminton_dummies.drop(columns=['Badminton'])
y_train = mBadminton_dummies['Badminton']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mBadminton_dummies.drop(columns=['Badminton'])
y_test = mBadminton_dummies['Badminton']

In [49]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9930650093716089

In [50]:
mBadminton_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mBadminton_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mBadminton_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mBadminton_df)

array([0.87686038, 0.69172   , 0.6883327 , ..., 0.36436613, 0.46870165,
       0.64304443])

Unnamed: 0,Age,Height,Weight,NOC,Badminton
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,True
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [51]:
mBadminton_pred = y_pred.tolist()

In [52]:
mBadminton_df['Men_Badminton_Prediction'] = mBadminton_pred

In [53]:
mBadminton_df

Unnamed: 0,Age,Height,Weight,NOC,Badminton,Men_Badminton_Prediction
0,1,4,2,CHN,False,0.876860
1,1,3,1,CHN,False,0.691720
26,2,4,1,FIN,True,0.688333
29,2,5,4,FIN,False,0.003795
52,1,5,2,NOR,False,0.236207
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.000595
204023,2,3,1,ARG,False,0.001214
204024,1,4,1,USA,False,0.364366
204025,1,5,1,RUS,False,0.468702


### Men's Athletics Regression

In [54]:
# Preserve Athletics column
mAthletics_list = male_df['Athletics'].tolist()

In [55]:
# Delete all sport columns
mAthletics_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mAthletics_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [56]:
# Add Athletics column back in
mAthletics_df['Athletics'] = mAthletics_list

mAthletics_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Athletics
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,True
52,1,5,2,NOR,False


In [57]:
mAthletics_dummies = pd.get_dummies(mAthletics_df)
mAthletics_dummies.head(3)

Unnamed: 0,Athletics,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mAthletics_dummies.drop(columns=['Athletics'])
y_train = mAthletics_dummies['Athletics']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mAthletics_dummies.drop(columns=['Athletics'])
y_test = mAthletics_dummies['Athletics']

In [59]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.849422906185262

In [60]:
mAthletics_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mAthletics_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mAthletics_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mAthletics_df)

array([0.17961477, 0.1980378 , 0.70622393, ..., 0.60348781, 0.67395344,
       0.73499425])

Unnamed: 0,Age,Height,Weight,NOC,Athletics
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,True
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [61]:
mAthletics_pred = y_pred.tolist()

In [62]:
mAthletics_df['Men_Athletics_Prediction'] = mAthletics_pred

In [63]:
mAthletics_df

Unnamed: 0,Age,Height,Weight,NOC,Athletics,Men_Athletics_Prediction
0,1,4,2,CHN,False,0.179615
1,1,3,1,CHN,False,0.198038
26,2,4,1,FIN,False,0.706224
29,2,5,4,FIN,True,0.925975
52,1,5,2,NOR,False,0.388571
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.109636
204023,2,3,1,ARG,False,0.141793
204024,1,4,1,USA,False,0.603488
204025,1,5,1,RUS,False,0.673953


### Men's Weightlifting Regression

In [64]:
# Preserve Weightlifting column
mWeightlifting_list = male_df['Weightlifting'].tolist()

In [65]:
# Delete all sport columns
mWeightlifting_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mWeightlifting_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [66]:
# Add Weightlifting column back in
mWeightlifting_df['Weightlifting'] = mWeightlifting_list

mWeightlifting_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Weightlifting
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [67]:
mWeightlifting_dummies = pd.get_dummies(mWeightlifting_df)
mWeightlifting_dummies.head(3)

Unnamed: 0,Weightlifting,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mWeightlifting_dummies.drop(columns=['Weightlifting'])
y_train = mWeightlifting_dummies['Weightlifting']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mWeightlifting_dummies.drop(columns=['Weightlifting'])
y_test = mWeightlifting_dummies['Weightlifting']

In [69]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9770444904804183

In [70]:
mWeightlifting_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mWeightlifting_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mWeightlifting_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mWeightlifting_df)

array([0.75146775, 0.69968667, 0.13099002, ..., 0.06395351, 0.0012944 ,
       0.0015249 ])

Unnamed: 0,Age,Height,Weight,NOC,Weightlifting
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [71]:
mWeightlifting_pred = y_pred.tolist()

In [72]:
mWeightlifting_df['Men_Weightlifting_Prediction'] = mWeightlifting_pred

In [73]:
mWeightlifting_df

Unnamed: 0,Age,Height,Weight,NOC,Weightlifting,Men_Weightlifting_Prediction
0,1,4,2,CHN,False,0.751468
1,1,3,1,CHN,False,0.699687
26,2,4,1,FIN,False,0.130990
29,2,5,4,FIN,False,0.851173
52,1,5,2,NOR,False,0.022371
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.148544
204023,2,3,1,ARG,False,0.170520
204024,1,4,1,USA,False,0.063954
204025,1,5,1,RUS,False,0.001294


### Men's Wrestling Regression

In [74]:
# Preserve Wrestling column
mWrestling_list = male_df['Wrestling'].tolist()

In [75]:
# Delete all sport columns
mWrestling_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mWrestling_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [76]:
# Add Wrestling column back in
mWrestling_df['Wrestling'] = mWrestling_list

mWrestling_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Wrestling
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,True


In [77]:
mWrestling_dummies = pd.get_dummies(mWrestling_df)
mWrestling_dummies.head(3)

Unnamed: 0,Wrestling,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mWrestling_dummies.drop(columns=['Wrestling'])
y_train = mWrestling_dummies['Wrestling']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mWrestling_dummies.drop(columns=['Wrestling'])
y_test = mWrestling_dummies['Wrestling']

In [79]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9538818190786229

In [80]:
mWrestling_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mWrestling_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mWrestling_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mWrestling_df)

array([0.41812517, 0.44740444, 0.56838723, ..., 0.33945416, 0.0967113 ,
       0.12796833])

Unnamed: 0,Age,Height,Weight,NOC,Wrestling
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,True
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [81]:
mWrestling_pred = y_pred.tolist()

In [82]:
mWrestling_df['Men_Wrestling_Prediction'] = mWrestling_pred

In [83]:
mWrestling_df

Unnamed: 0,Age,Height,Weight,NOC,Wrestling,Men_Wrestling_Prediction
0,1,4,2,CHN,False,0.418125
1,1,3,1,CHN,False,0.447404
26,2,4,1,FIN,False,0.568387
29,2,5,4,FIN,False,0.969239
52,1,5,2,NOR,True,0.248509
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.508612
204023,2,3,1,ARG,False,0.586549
204024,1,4,1,USA,False,0.339454
204025,1,5,1,RUS,False,0.096711


### Men's Rowing Regression

In [84]:
# Preserve Rowing column
mRowing_list = male_df['Rowing'].tolist()

In [85]:
# Delete all sport columns
mRowing_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mRowing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [86]:
# Add Rowing column back in
mRowing_df['Rowing'] = mRowing_list

mRowing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Rowing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [87]:
mRowing_dummies = pd.get_dummies(mRowing_df)
mRowing_dummies.head(3)

Unnamed: 0,Rowing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mRowing_dummies.drop(columns=['Rowing'])
y_train = mRowing_dummies['Rowing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mRowing_dummies.drop(columns=['Rowing'])
y_test = mRowing_dummies['Rowing']

In [89]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9467495314195521

In [90]:
mRowing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mRowing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mRowing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mRowing_df)

array([0.21879786, 0.02464013, 0.18402164, ..., 0.1880857 , 0.339068  ,
       0.37372527])

Unnamed: 0,Age,Height,Weight,NOC,Rowing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,True


In [91]:
mRowing_pred = y_pred.tolist()

In [92]:
mRowing_df['Men_Rowing_Prediction'] = mRowing_pred

In [93]:
mRowing_df

Unnamed: 0,Age,Height,Weight,NOC,Rowing,Men_Rowing_Prediction
0,1,4,2,CHN,False,0.218798
1,1,3,1,CHN,False,0.024640
26,2,4,1,FIN,False,0.184022
29,2,5,4,FIN,False,0.001198
52,1,5,2,NOR,False,0.833414
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.062718
204023,2,3,1,ARG,False,0.072214
204024,1,4,1,USA,False,0.188086
204025,1,5,1,RUS,True,0.339068


### Men's Swimming Regression

In [94]:
# Preserve Swimming column
mSwimming_list = male_df['Swimming'].tolist()

In [95]:
# Delete all sport columns
mSwimming_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mSwimming_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [96]:
# Add Swimming column back in
mSwimming_df['Swimming'] = mSwimming_list

mSwimming_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Swimming
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [97]:
mSwimming_dummies = pd.get_dummies(mSwimming_df)
mSwimming_dummies.head(3)

Unnamed: 0,Swimming,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mSwimming_dummies.drop(columns=['Swimming'])
y_train = mSwimming_dummies['Swimming']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mSwimming_dummies.drop(columns=['Swimming'])
y_test = mSwimming_dummies['Swimming']

In [99]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9097267436125086

In [100]:
mSwimming_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mSwimming_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mSwimming_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mSwimming_df)

array([0.51270359, 0.15627552, 0.15566816, ..., 0.52033156, 0.80759335,
       0.47916554])

Unnamed: 0,Age,Height,Weight,NOC,Swimming
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [101]:
mSwimming_pred = y_pred.tolist()

In [102]:
mSwimming_df['Men_Swimming_Prediction'] = mSwimming_pred

In [103]:
mSwimming_df

Unnamed: 0,Age,Height,Weight,NOC,Swimming,Men_Swimming_Prediction
0,1,4,2,CHN,False,0.512704
1,1,3,1,CHN,False,0.156276
26,2,4,1,FIN,False,0.155668
29,2,5,4,FIN,False,0.000438
52,1,5,2,NOR,False,0.584758
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.114583
204023,2,3,1,ARG,False,0.027583
204024,1,4,1,USA,False,0.520332
204025,1,5,1,RUS,False,0.807593


### Men's Football Regression

In [104]:
# Preserve Football column
mFootball_list = male_df['Football'].tolist()

In [105]:
# Delete all sport columns
mFootball_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mFootball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [106]:
# Add Football column back in
mFootball_df['Football'] = mFootball_list

mFootball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Football
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [107]:
mFootball_dummies = pd.get_dummies(mFootball_df)
mFootball_dummies.head(3)

Unnamed: 0,Football,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mFootball_dummies.drop(columns=['Football'])
y_train = mFootball_dummies['Football']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mFootball_dummies.drop(columns=['Football'])
y_test = mFootball_dummies['Football']

In [109]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9686495018249975

In [110]:
mFootball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mFootball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mFootball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mFootball_df)

array([0.54536813, 0.42457145, 0.28704633, ..., 0.62579105, 0.0022136 ,
       0.00088743])

Unnamed: 0,Age,Height,Weight,NOC,Football
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,True
204025,1,5,1,RUS,False


In [111]:
mFootball_pred = y_pred.tolist()

In [112]:
mFootball_df['Men_Football_Prediction'] = mFootball_pred

In [113]:
mFootball_df

Unnamed: 0,Age,Height,Weight,NOC,Football,Men_Football_Prediction
0,1,4,2,CHN,False,0.545368
1,1,3,1,CHN,False,0.424571
26,2,4,1,FIN,False,0.287046
29,2,5,4,FIN,False,0.000570
52,1,5,2,NOR,False,0.453192
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.753327
204023,2,3,1,ARG,False,0.550098
204024,1,4,1,USA,True,0.625791
204025,1,5,1,RUS,False,0.002214


### Men's Equestrianism Regression

In [114]:
# Preserve Equestrianism column
mEquestrianism_list = male_df['Equestrianism'].tolist()

In [115]:
# Delete all sport columns
mEquestrianism_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mEquestrianism_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [116]:
# Add Equestrianism column back in
mEquestrianism_df['Equestrianism'] = mEquestrianism_list

mEquestrianism_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Equestrianism
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [117]:
mEquestrianism_dummies = pd.get_dummies(mEquestrianism_df)
mEquestrianism_dummies.head(3)

Unnamed: 0,Equestrianism,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [118]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mEquestrianism_dummies.drop(columns=['Equestrianism'])
y_train = mEquestrianism_dummies['Equestrianism']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mEquestrianism_dummies.drop(columns=['Equestrianism'])
y_test = mEquestrianism_dummies['Equestrianism']

In [119]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9712439577784354

In [120]:
mEquestrianism_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mEquestrianism_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mEquestrianism_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mEquestrianism_df)

array([0.01843073, 0.04482098, 0.02763123, ..., 0.35227477, 0.05009648,
       0.1372135 ])

Unnamed: 0,Age,Height,Weight,NOC,Equestrianism
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [121]:
mEquestrianism_pred = y_pred.tolist()

In [122]:
mEquestrianism_df['Men_Equestrianism_Prediction'] = mEquestrianism_pred

In [123]:
mEquestrianism_df

Unnamed: 0,Age,Height,Weight,NOC,Equestrianism,Men_Equestrianism_Prediction
0,1,4,2,CHN,False,0.018431
1,1,3,1,CHN,False,0.044821
26,2,4,1,FIN,False,0.027631
29,2,5,4,FIN,False,0.000017
52,1,5,2,NOR,False,0.036798
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.439518
204023,2,3,1,ARG,False,0.702799
204024,1,4,1,USA,False,0.352275
204025,1,5,1,RUS,False,0.050096


### Men's Shooting Regression

In [124]:
# Preserve Shooting column
mShooting_list = male_df['Shooting'].tolist()

In [125]:
# Delete all sport columns
mShooting_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mShooting_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [126]:
# Add Shooting column back in
mShooting_df['Shooting'] = mShooting_list

mShooting_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Shooting
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [127]:
mShooting_dummies = pd.get_dummies(mShooting_df)
mShooting_dummies.head(3)

Unnamed: 0,Shooting,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [128]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mShooting_dummies.drop(columns=['Shooting'])
y_train = mShooting_dummies['Shooting']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mShooting_dummies.drop(columns=['Shooting'])
y_test = mShooting_dummies['Shooting']

In [129]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9528756042221564

In [130]:
mShooting_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mShooting_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mShooting_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mShooting_df)

array([0.64513801, 0.42127571, 0.61144173, ..., 0.27083469, 0.09894731,
       0.21909892])

Unnamed: 0,Age,Height,Weight,NOC,Shooting
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [131]:
mShooting_pred = y_pred.tolist()

In [132]:
mShooting_df['Men_Shooting_Prediction'] = mShooting_pred

In [133]:
mShooting_df

Unnamed: 0,Age,Height,Weight,NOC,Shooting,Men_Shooting_Prediction
0,1,4,2,CHN,False,0.645138
1,1,3,1,CHN,False,0.421276
26,2,4,1,FIN,False,0.611442
29,2,5,4,FIN,False,0.454390
52,1,5,2,NOR,False,0.454773
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.142830
204023,2,3,1,ARG,False,0.298609
204024,1,4,1,USA,False,0.270835
204025,1,5,1,RUS,False,0.098947


### Men's Gymnastics Regression

In [134]:
# Preserve Gymnastics column
mGymnastics_list = male_df['Gymnastics'].tolist()

In [135]:
# Delete all sport columns
mGymnastics_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mGymnastics_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [136]:
# Add Gymnastics column back in
mGymnastics_df['Gymnastics'] = mGymnastics_list

mGymnastics_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Gymnastics
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [137]:
mGymnastics_dummies = pd.get_dummies(mGymnastics_df)
mGymnastics_dummies.head(3)

Unnamed: 0,Gymnastics,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mGymnastics_dummies.drop(columns=['Gymnastics'])
y_train = mGymnastics_dummies['Gymnastics']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mGymnastics_dummies.drop(columns=['Gymnastics'])
y_test = mGymnastics_dummies['Gymnastics']

In [139]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9194534872250173

In [140]:
mGymnastics_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mGymnastics_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mGymnastics_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mGymnastics_df)

array([0.07111118, 0.90003492, 0.6215189 , ..., 0.61233831, 0.13017348,
       0.08022574])

Unnamed: 0,Age,Height,Weight,NOC,Gymnastics
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [141]:
mGymnastics_pred = y_pred.tolist()

In [142]:
mGymnastics_df['Men_Gymnastics_Prediction'] = mGymnastics_pred

In [143]:
mGymnastics_df

Unnamed: 0,Age,Height,Weight,NOC,Gymnastics,Men_Gymnastics_Prediction
0,1,4,2,CHN,False,0.071111
1,1,3,1,CHN,False,0.900035
26,2,4,1,FIN,False,0.621519
29,2,5,4,FIN,False,0.001904
52,1,5,2,NOR,False,0.012862
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.497567
204023,2,3,1,ARG,False,0.365960
204024,1,4,1,USA,False,0.612338
204025,1,5,1,RUS,False,0.130173


### Men's Taekwondo Regression

In [144]:
# Preserve Taekwondo column
mTaekwondo_list = male_df['Taekwondo'].tolist()

In [145]:
# Delete all sport columns
mTaekwondo_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mTaekwondo_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [146]:
# Add Taekwondo column back in
mTaekwondo_df['Taekwondo'] = mTaekwondo_list

mTaekwondo_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Taekwondo
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [147]:
mTaekwondo_dummies = pd.get_dummies(mTaekwondo_df)
mTaekwondo_dummies.head(3)

Unnamed: 0,Taekwondo,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [148]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mTaekwondo_dummies.drop(columns=['Taekwondo'])
y_train = mTaekwondo_dummies['Taekwondo']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mTaekwondo_dummies.drop(columns=['Taekwondo'])
y_test = mTaekwondo_dummies['Taekwondo']

In [149]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9969813554306007

In [150]:
mTaekwondo_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mTaekwondo_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mTaekwondo_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mTaekwondo_df)

array([0.58502638, 0.47837475, 0.23021405, ..., 0.32079041, 0.6562277 ,
       0.70787878])

Unnamed: 0,Age,Height,Weight,NOC,Taekwondo
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [151]:
mTaekwondo_pred = y_pred.tolist()

In [152]:
mTaekwondo_df['Men_Taekwondo_Prediction'] = mTaekwondo_pred

In [153]:
mTaekwondo_df

Unnamed: 0,Age,Height,Weight,NOC,Taekwondo,Men_Taekwondo_Prediction
0,1,4,2,CHN,False,0.585026
1,1,3,1,CHN,False,0.478375
26,2,4,1,FIN,False,0.230214
29,2,5,4,FIN,False,0.000463
52,1,5,2,NOR,False,0.003245
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.321674
204023,2,3,1,ARG,False,0.375776
204024,1,4,1,USA,False,0.320790
204025,1,5,1,RUS,False,0.656228


### Men's Boxing Regression

In [154]:
# Preserve Boxing column
mBoxing_list = male_df['Boxing'].tolist()

In [155]:
# Delete all sport columns
mBoxing_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mBoxing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [156]:
# Add Boxing column back in
mBoxing_df['Boxing'] = mBoxing_list

mBoxing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Boxing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [157]:
mBoxing_dummies = pd.get_dummies(mBoxing_df)
mBoxing_dummies.head(3)

Unnamed: 0,Boxing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [158]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mBoxing_dummies.drop(columns=['Boxing'])
y_train = mBoxing_dummies['Boxing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mBoxing_dummies.drop(columns=['Boxing'])
y_test = mBoxing_dummies['Boxing']

In [159]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9596922166321397

In [160]:
mBoxing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mBoxing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mBoxing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mBoxing_df)

array([0.2630388 , 0.38401185, 0.40501808, ..., 0.52740096, 0.49819101,
       0.34306898])

Unnamed: 0,Age,Height,Weight,NOC,Boxing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [161]:
mBoxing_pred = y_pred.tolist()

In [162]:
mBoxing_df['Men_Boxing_Prediction'] = mBoxing_pred

In [163]:
mBoxing_df

Unnamed: 0,Age,Height,Weight,NOC,Boxing,Men_Boxing_Prediction
0,1,4,2,CHN,False,0.263039
1,1,3,1,CHN,False,0.384012
26,2,4,1,FIN,False,0.405018
29,2,5,4,FIN,False,0.056806
52,1,5,2,NOR,False,0.237949
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.640187
204023,2,3,1,ARG,False,0.483448
204024,1,4,1,USA,False,0.527401
204025,1,5,1,RUS,False,0.498191


### Men's Fencing Regression

In [164]:
# Preserve Fencing column
mFencing_list = male_df['Fencing'].tolist()

In [165]:
# Delete all sport columns
mFencing_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mFencing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [166]:
# Add Fencing column back in
mFencing_df['Fencing'] = mFencing_list

mFencing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Fencing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [167]:
mFencing_dummies = pd.get_dummies(mFencing_df)
mFencing_dummies.head(3)

Unnamed: 0,Fencing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [168]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mFencing_dummies.drop(columns=['Fencing'])
y_train = mFencing_dummies['Fencing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mFencing_dummies.drop(columns=['Fencing'])
y_test = mFencing_dummies['Fencing']

In [169]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9571372200848377

In [170]:
mFencing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mFencing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mFencing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mFencing_df)

array([0.59807748, 0.36641213, 0.40014296, ..., 0.53259899, 0.59110562,
       0.70441725])

Unnamed: 0,Age,Height,Weight,NOC,Fencing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [171]:
mFencing_pred = y_pred.tolist()

In [172]:
mFencing_df['Men_Fencing_Prediction'] = mFencing_pred

In [173]:
mFencing_df

Unnamed: 0,Age,Height,Weight,NOC,Fencing,Men_Fencing_Prediction
0,1,4,2,CHN,False,0.598077
1,1,3,1,CHN,False,0.366412
26,2,4,1,FIN,False,0.400143
29,2,5,4,FIN,False,0.000826
52,1,5,2,NOR,False,0.466026
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.384037
204023,2,3,1,ARG,False,0.506858
204024,1,4,1,USA,False,0.532599
204025,1,5,1,RUS,False,0.591106


### Men's Diving Regression

In [174]:
# Preserve Diving column
mDiving_list = male_df['Diving'].tolist()

In [175]:
# Delete all sport columns
mDiving_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mDiving_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [176]:
# Add Diving column back in
mDiving_df['Diving'] = mDiving_list

mDiving_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Diving
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [177]:
mDiving_dummies = pd.get_dummies(mDiving_df)
mDiving_dummies.head(3)

Unnamed: 0,Diving,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [178]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mDiving_dummies.drop(columns=['Diving'])
y_train = mDiving_dummies['Diving']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mDiving_dummies.drop(columns=['Diving'])
y_test = mDiving_dummies['Diving']

In [179]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9899575811384038

In [180]:
mDiving_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mDiving_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mDiving_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mDiving_df)

array([0.47884689, 0.82131585, 0.56793348, ..., 0.83731579, 0.44058565,
       0.34360331])

Unnamed: 0,Age,Height,Weight,NOC,Diving
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [181]:
mDiving_pred = y_pred.tolist()

In [182]:
mDiving_df['Men_Diving_Prediction'] = mDiving_pred

In [183]:
mDiving_df

Unnamed: 0,Age,Height,Weight,NOC,Diving,Men_Diving_Prediction
0,1,4,2,CHN,False,0.478847
1,1,3,1,CHN,False,0.821316
26,2,4,1,FIN,False,0.567933
29,2,5,4,FIN,False,0.001273
52,1,5,2,NOR,False,0.119797
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.004641
204023,2,3,1,ARG,False,0.003090
204024,1,4,1,USA,False,0.837316
204025,1,5,1,RUS,False,0.440586


### Men's Canoeing Regression

In [184]:
# Preserve Canoeing column
mCanoeing_list = male_df['Canoeing'].tolist()

In [185]:
# Delete all sport columns
mCanoeing_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mCanoeing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [186]:
# Add Canoeing column back in
mCanoeing_df['Canoeing'] = mCanoeing_list

mCanoeing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Canoeing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [187]:
mCanoeing_dummies = pd.get_dummies(mCanoeing_df)
mCanoeing_dummies.head(3)

Unnamed: 0,Canoeing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [188]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mCanoeing_dummies.drop(columns=['Canoeing'])
y_train = mCanoeing_dummies['Canoeing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mCanoeing_dummies.drop(columns=['Canoeing'])
y_test = mCanoeing_dummies['Canoeing']

In [189]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9589523527670909

In [190]:
mCanoeing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mCanoeing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mCanoeing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mCanoeing_df)

array([0.66745338, 0.23774355, 0.53699233, ..., 0.42917575, 0.39897097,
       0.40518823])

Unnamed: 0,Age,Height,Weight,NOC,Canoeing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [191]:
mCanoeing_pred = y_pred.tolist()

In [192]:
mCanoeing_df['Men_Canoeing_Prediction'] = mCanoeing_pred

In [193]:
mCanoeing_df

Unnamed: 0,Age,Height,Weight,NOC,Canoeing,Men_Canoeing_Prediction
0,1,4,2,CHN,False,0.667453
1,1,3,1,CHN,False,0.237744
26,2,4,1,FIN,False,0.536992
29,2,5,4,FIN,False,0.002569
52,1,5,2,NOR,False,0.792687
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.204548
204023,2,3,1,ARG,False,0.208788
204024,1,4,1,USA,False,0.429176
204025,1,5,1,RUS,False,0.398971


### Men's Handball Regression

In [194]:
# Preserve Handball column
mHandball_list = male_df['Handball'].tolist()

In [195]:
# Delete all sport columns
mHandball_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mHandball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [196]:
# Add Handball column back in
mHandball_df['Handball'] = mHandball_list

mHandball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Handball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [197]:
mHandball_dummies = pd.get_dummies(mHandball_df)
mHandball_dummies.head(3)

Unnamed: 0,Handball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [198]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mHandball_dummies.drop(columns=['Handball'])
y_train = mHandball_dummies['Handball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mHandball_dummies.drop(columns=['Handball'])
y_test = mHandball_dummies['Handball']

In [199]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9800434053467495

In [200]:
mHandball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mHandball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mHandball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mHandball_df)

array([0.25973194, 0.00685944, 0.00041046, ..., 0.06145538, 0.20497702,
       0.32077998])

Unnamed: 0,Age,Height,Weight,NOC,Handball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [201]:
mHandball_pred = y_pred.tolist()

In [202]:
mHandball_df['Men_Handball_Prediction'] = mHandball_pred

In [203]:
mHandball_df

Unnamed: 0,Age,Height,Weight,NOC,Handball,Men_Handball_Prediction
0,1,4,2,CHN,False,0.259732
1,1,3,1,CHN,False,0.006859
26,2,4,1,FIN,False,0.000410
29,2,5,4,FIN,False,0.000826
52,1,5,2,NOR,False,0.658578
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.014632
204023,2,3,1,ARG,False,0.026480
204024,1,4,1,USA,False,0.061455
204025,1,5,1,RUS,False,0.204977


### Men's Water Polo Regression

In [204]:
# Preserve Water Polo column
mWater_Polo_list = male_df['Water Polo'].tolist()

In [205]:
# Delete all sport columns
mWater_Polo_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mWater_Polo_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [206]:
# Add Water Polo column back in
mWater_Polo_df['Water Polo'] = mWater_Polo_list

mWater_Polo_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Water Polo
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [207]:
mWater_Polo_dummies = pd.get_dummies(mWater_Polo_df)
mWater_Polo_dummies.head(3)

Unnamed: 0,Water Polo,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [208]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mWater_Polo_dummies.drop(columns=['Water Polo'])
y_train = mWater_Polo_dummies['Water Polo']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mWater_Polo_dummies.drop(columns=['Water Polo'])
y_test = mWater_Polo_dummies['Water Polo']

In [209]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9787905692019335

In [210]:
mWater_Polo_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mWater_Polo_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mWater_Polo_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mWater_Polo_df)

array([0.61218568, 0.04579336, 0.00069349, ..., 0.29635312, 0.25173759,
       0.28076883])

Unnamed: 0,Age,Height,Weight,NOC,Water Polo
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [211]:
mWater_Polo_pred = y_pred.tolist()

In [212]:
mWater_Polo_df['Men_Water_Polo_Prediction'] = mWater_Polo_pred

In [213]:
mWater_Polo_df

Unnamed: 0,Age,Height,Weight,NOC,Water Polo,Men_Water_Polo_Prediction
0,1,4,2,CHN,False,0.612186
1,1,3,1,CHN,False,0.045793
26,2,4,1,FIN,False,0.000693
29,2,5,4,FIN,False,0.000589
52,1,5,2,NOR,False,0.004496
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.010380
204023,2,3,1,ARG,False,0.012025
204024,1,4,1,USA,False,0.296353
204025,1,5,1,RUS,False,0.251738


### Men's Tennis Regression

In [214]:
# Preserve Tennis column
mTennis_list = male_df['Tennis'].tolist()

In [215]:
# Delete all sport columns
mTennis_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mTennis_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [216]:
# Add Tennis column back in
mTennis_df['Tennis'] = mTennis_list

mTennis_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Tennis
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [217]:
mTennis_dummies = pd.get_dummies(mTennis_df)
mTennis_dummies.head(3)

Unnamed: 0,Tennis,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [218]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mTennis_dummies.drop(columns=['Tennis'])
y_train = mTennis_dummies['Tennis']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mTennis_dummies.drop(columns=['Tennis'])
y_test = mTennis_dummies['Tennis']

In [219]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.989868797474598

In [220]:
mTennis_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mTennis_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mTennis_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mTennis_df)

array([0.43943872, 0.08335994, 0.16213108, ..., 0.32590021, 0.76617216,
       0.83543247])

Unnamed: 0,Age,Height,Weight,NOC,Tennis
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [221]:
mTennis_pred = y_pred.tolist()

In [222]:
mTennis_df['Men_Tennis_Prediction'] = mTennis_pred

In [223]:
mTennis_df

Unnamed: 0,Age,Height,Weight,NOC,Tennis,Men_Tennis_Prediction
0,1,4,2,CHN,False,0.439439
1,1,3,1,CHN,False,0.083360
26,2,4,1,FIN,False,0.162131
29,2,5,4,FIN,False,0.000448
52,1,5,2,NOR,False,0.517772
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.283421
204023,2,3,1,ARG,False,0.379953
204024,1,4,1,USA,False,0.325900
204025,1,5,1,RUS,False,0.766172


### Men's Cycling Regression

In [224]:
# Preserve Cycling column
mCycling_list = male_df['Cycling'].tolist()

In [225]:
# Delete all sport columns
mCycling_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mCycling_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [226]:
# Add Cycling column back in
mCycling_df['Cycling'] = mCycling_list

mCycling_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Cycling
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [227]:
mCycling_dummies = pd.get_dummies(mCycling_df)
mCycling_dummies.head(3)

Unnamed: 0,Cycling,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [228]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mCycling_dummies.drop(columns=['Cycling'])
y_train = mCycling_dummies['Cycling']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mCycling_dummies.drop(columns=['Cycling'])
y_test = mCycling_dummies['Cycling']

In [229]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9384433264279373

In [230]:
mCycling_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mCycling_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mCycling_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mCycling_df)

array([0.35548157, 0.31268527, 0.57315255, ..., 0.63774788, 0.55875715,
       0.51465461])

Unnamed: 0,Age,Height,Weight,NOC,Cycling
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [231]:
mCycling_pred = y_pred.tolist()

In [232]:
mCycling_df['Men_Cycling_Prediction'] = mCycling_pred

In [233]:
mCycling_df

Unnamed: 0,Age,Height,Weight,NOC,Cycling,Men_Cycling_Prediction
0,1,4,2,CHN,False,0.355482
1,1,3,1,CHN,False,0.312685
26,2,4,1,FIN,False,0.573153
29,2,5,4,FIN,False,0.001576
52,1,5,2,NOR,False,0.597105
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.600771
204023,2,3,1,ARG,False,0.557543
204024,1,4,1,USA,False,0.637748
204025,1,5,1,RUS,False,0.558757


### Men's Hockey Regression

In [234]:
# Preserve Hockey column
mHockey_list = male_df['Hockey'].tolist()

In [235]:
# Delete all sport columns
mHockey_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mHockey_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [236]:
# Add Hockey column back in
mHockey_df['Hockey'] = mHockey_list

mHockey_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Hockey
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [237]:
mHockey_dummies = pd.get_dummies(mHockey_df)
mHockey_dummies.head(3)

Unnamed: 0,Hockey,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [238]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mHockey_dummies.drop(columns=['Hockey'])
y_train = mHockey_dummies['Hockey']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mHockey_dummies.drop(columns=['Hockey'])
y_test = mHockey_dummies['Hockey']

In [239]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9731577389760284

In [240]:
mHockey_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mHockey_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mHockey_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mHockey_df)

array([0.26927975, 0.21895873, 0.00281652, ..., 0.24692255, 0.00075184,
       0.00104668])

Unnamed: 0,Age,Height,Weight,NOC,Hockey
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,True
204023,2,3,1,ARG,True
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [241]:
mHockey_pred = y_pred.tolist()

In [242]:
mHockey_df['Men_Hockey_Prediction'] = mHockey_pred

In [243]:
mHockey_df

Unnamed: 0,Age,Height,Weight,NOC,Hockey,Men_Hockey_Prediction
0,1,4,2,CHN,False,0.269280
1,1,3,1,CHN,False,0.218959
26,2,4,1,FIN,False,0.002817
29,2,5,4,FIN,False,0.000004
52,1,5,2,NOR,False,0.001552
...,...,...,...,...,...,...
204022,1,3,1,ARG,True,0.830423
204023,2,3,1,ARG,True,0.872114
204024,1,4,1,USA,False,0.246923
204025,1,5,1,RUS,False,0.000752


### Men's Archery Regression

In [244]:
# Preserve Archery column
mArchery_list = male_df['Archery'].tolist()

In [245]:
# Delete all sport columns
mArchery_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mArchery_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [246]:
# Add Archery column back in
mArchery_df['Archery'] = mArchery_list

mArchery_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Archery
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [247]:
mArchery_dummies = pd.get_dummies(mArchery_df)
mArchery_dummies.head(3)

Unnamed: 0,Archery,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [248]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mArchery_dummies.drop(columns=['Archery'])
y_train = mArchery_dummies['Archery']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mArchery_dummies.drop(columns=['Archery'])
y_test = mArchery_dummies['Archery']

In [249]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9902633915359574

In [250]:
mArchery_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mArchery_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mArchery_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mArchery_df)

array([0.8650016 , 0.66345755, 0.7036902 , ..., 0.54468297, 0.36700646,
       0.34959794])

Unnamed: 0,Age,Height,Weight,NOC,Archery
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [251]:
mArchery_pred = y_pred.tolist()

In [252]:
mArchery_df['Men_Archery_Prediction'] = mArchery_pred

In [253]:
mArchery_df

Unnamed: 0,Age,Height,Weight,NOC,Archery,Men_Archery_Prediction
0,1,4,2,CHN,False,0.865002
1,1,3,1,CHN,False,0.663458
26,2,4,1,FIN,False,0.703690
29,2,5,4,FIN,False,0.527974
52,1,5,2,NOR,False,0.620557
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.000561
204023,2,3,1,ARG,False,0.000520
204024,1,4,1,USA,False,0.544683
204025,1,5,1,RUS,False,0.367006


### Men's Volleyball Regression

In [254]:
# Preserve Volleyball column
mVolleyball_list = male_df['Volleyball'].tolist()

In [255]:
# Delete all sport columns
mVolleyball_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mVolleyball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [256]:
# Add Volleyball column back in
mVolleyball_df['Volleyball'] = mVolleyball_list

mVolleyball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Volleyball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [257]:
mVolleyball_dummies = pd.get_dummies(mVolleyball_df)
mVolleyball_dummies.head(3)

Unnamed: 0,Volleyball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [258]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mVolleyball_dummies.drop(columns=['Volleyball'])
y_train = mVolleyball_dummies['Volleyball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mVolleyball_dummies.drop(columns=['Volleyball'])
y_test = mVolleyball_dummies['Volleyball']

In [259]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.982874617737003

In [260]:
mVolleyball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mVolleyball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mVolleyball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mVolleyball_df)

array([1.25680449e-01, 3.42832807e-03, 3.78002922e-04, ...,
       1.00171438e-01, 3.90199527e-01, 4.92850835e-01])

Unnamed: 0,Age,Height,Weight,NOC,Volleyball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [261]:
mVolleyball_pred = y_pred.tolist()

In [262]:
mVolleyball_df['Men_Volleyball_Prediction'] = mVolleyball_pred

In [263]:
mVolleyball_df

Unnamed: 0,Age,Height,Weight,NOC,Volleyball,Men_Volleyball_Prediction
0,1,4,2,CHN,False,0.125680
1,1,3,1,CHN,False,0.003428
26,2,4,1,FIN,False,0.000378
29,2,5,4,FIN,False,0.000031
52,1,5,2,NOR,False,0.002509
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.016319
204023,2,3,1,ARG,False,0.024576
204024,1,4,1,USA,False,0.100171
204025,1,5,1,RUS,False,0.390200


### Men's Modern Pentathlon Regression

In [264]:
# Preserve Modern Pentathlon column
mModern_Pentathlon_list = male_df['Modern Pentathlon'].tolist()

In [265]:
# Delete all sport columns
mModern_Pentathlon_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mModern_Pentathlon_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [266]:
# Add Modern Pentathlon column back in
mModern_Pentathlon_df['Modern Pentathlon'] = mModern_Pentathlon_list

mModern_Pentathlon_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Modern Pentathlon
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [267]:
mModern_Pentathlon_dummies = pd.get_dummies(mModern_Pentathlon_df)
mModern_Pentathlon_dummies.head(3)

Unnamed: 0,Modern Pentathlon,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [268]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mModern_Pentathlon_dummies.drop(columns=['Modern Pentathlon'])
y_train = mModern_Pentathlon_dummies['Modern Pentathlon']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mModern_Pentathlon_dummies.drop(columns=['Modern Pentathlon'])
y_test = mModern_Pentathlon_dummies['Modern Pentathlon']

In [269]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9897109598500543

In [270]:
mModern_Pentathlon_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mModern_Pentathlon_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mModern_Pentathlon_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mModern_Pentathlon_df)

array([0.22673307, 0.20507936, 0.87595511, ..., 0.68565951, 0.58949481,
       0.69232751])

Unnamed: 0,Age,Height,Weight,NOC,Modern Pentathlon
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [271]:
mModern_Pentathlon_pred = y_pred.tolist()

In [272]:
mModern_Pentathlon_df['Men_Modern_Pentathlon_Prediction'] = mModern_Pentathlon_pred

In [273]:
mModern_Pentathlon_df

Unnamed: 0,Age,Height,Weight,NOC,Modern Pentathlon,Men_Modern_Pentathlon_Prediction
0,1,4,2,CHN,False,0.226733
1,1,3,1,CHN,False,0.205079
26,2,4,1,FIN,False,0.875955
29,2,5,4,FIN,False,0.005958
52,1,5,2,NOR,False,0.001289
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.195266
204023,2,3,1,ARG,False,0.275478
204024,1,4,1,USA,False,0.685660
204025,1,5,1,RUS,False,0.589495


### Men's Table Tennis Regression

In [274]:
# Preserve Table Tennis column
mTable_Tennis_list = male_df['Table Tennis'].tolist()

In [275]:
# Delete all sport columns
mTable_Tennis_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mTable_Tennis_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [276]:
# Add Table Tennis column back in
mTable_Tennis_df['Table Tennis'] = mTable_Tennis_list

mTable_Tennis_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Table Tennis
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [277]:
mTable_Tennis_dummies = pd.get_dummies(mTable_Tennis_df)
mTable_Tennis_dummies.head(3)

Unnamed: 0,Table Tennis,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [278]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mTable_Tennis_dummies.drop(columns=['Table Tennis'])
y_train = mTable_Tennis_dummies['Table Tennis']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mTable_Tennis_dummies.drop(columns=['Table Tennis'])
y_test = mTable_Tennis_dummies['Table Tennis']

In [279]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9908552826279965

In [280]:
mTable_Tennis_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mTable_Tennis_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mTable_Tennis_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mTable_Tennis_df)

array([0.73515326, 0.80470779, 0.14395509, ..., 0.46375361, 0.57717112,
       0.65091417])

Unnamed: 0,Age,Height,Weight,NOC,Table Tennis
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [281]:
mTable_Tennis_pred = y_pred.tolist()

In [282]:
mTable_Tennis_df['Men_Table_Tennis_Prediction'] = mTable_Tennis_pred

In [283]:
mTable_Tennis_df

Unnamed: 0,Age,Height,Weight,NOC,Table Tennis,Men_Table_Tennis_Prediction
0,1,4,2,CHN,False,0.735153
1,1,3,1,CHN,False,0.804708
26,2,4,1,FIN,False,0.143955
29,2,5,4,FIN,False,0.000177
52,1,5,2,NOR,False,0.001432
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.433153
204023,2,3,1,ARG,False,0.510721
204024,1,4,1,USA,False,0.463754
204025,1,5,1,RUS,False,0.577171


### Men's Baseball Regression

In [284]:
# Preserve Baseball column
mBaseball_list = male_df['Baseball'].tolist()

In [285]:
# Delete all sport columns
mBaseball_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mBaseball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [286]:
# Add Baseball column back in
mBaseball_df['Baseball'] = mBaseball_list

mBaseball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Baseball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [287]:
mBaseball_dummies = pd.get_dummies(mBaseball_df)
mBaseball_dummies.head(3)

Unnamed: 0,Baseball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [288]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mBaseball_dummies.drop(columns=['Baseball'])
y_train = mBaseball_dummies['Baseball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mBaseball_dummies.drop(columns=['Baseball'])
y_test = mBaseball_dummies['Baseball']

In [289]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9916543356022491

In [290]:
mBaseball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mBaseball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mBaseball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mBaseball_df)

array([9.42785274e-01, 3.86659386e-01, 1.65080361e-04, ...,
       3.40196434e-01, 8.16174235e-05, 1.07755400e-04])

Unnamed: 0,Age,Height,Weight,NOC,Baseball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [291]:
mBaseball_pred = y_pred.tolist()

In [292]:
mBaseball_df['Men_Baseball_Prediction'] = mBaseball_pred

In [293]:
mBaseball_df

Unnamed: 0,Age,Height,Weight,NOC,Baseball,Men_Baseball_Prediction
0,1,4,2,CHN,False,0.942785
1,1,3,1,CHN,False,0.386659
26,2,4,1,FIN,False,0.000165
29,2,5,4,FIN,False,0.000061
52,1,5,2,NOR,False,0.000719
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.000042
204023,2,3,1,ARG,False,0.000056
204024,1,4,1,USA,False,0.340196
204025,1,5,1,RUS,False,0.000082


### Men's Rugby Sevens Regression

In [294]:
# Preserve Rugby Sevens column
mRugby_Sevens_list = male_df['Rugby Sevens'].tolist()

In [295]:
# Delete all sport columns
mRugby_Sevens_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mRugby_Sevens_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [296]:
# Add Rugby Sevens column back in
mRugby_Sevens_df['Rugby Sevens'] = mRugby_Sevens_list

mRugby_Sevens_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Rugby Sevens
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [297]:
mRugby_Sevens_dummies = pd.get_dummies(mRugby_Sevens_df)
mRugby_Sevens_dummies.head(3)

Unnamed: 0,Rugby Sevens,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [298]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mRugby_Sevens_dummies.drop(columns=['Rugby Sevens'])
y_train = mRugby_Sevens_dummies['Rugby Sevens']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mRugby_Sevens_dummies.drop(columns=['Rugby Sevens'])
y_test = mRugby_Sevens_dummies['Rugby Sevens']

In [299]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9985104074183684

In [300]:
mRugby_Sevens_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mRugby_Sevens_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mRugby_Sevens_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mRugby_Sevens_df)

array([7.25119807e-04, 3.55164362e-05, 3.63402634e-05, ...,
       1.32573427e-01, 1.04831544e-05, 1.51683148e-05])

Unnamed: 0,Age,Height,Weight,NOC,Rugby Sevens
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [301]:
mRugby_Sevens_pred = y_pred.tolist()

In [302]:
mRugby_Sevens_df['Men_Rugby_Sevens_Prediction'] = mRugby_Sevens_pred

In [303]:
mRugby_Sevens_df

Unnamed: 0,Age,Height,Weight,NOC,Rugby Sevens,Men_Rugby_Sevens_Prediction
0,1,4,2,CHN,False,7.251198e-04
1,1,3,1,CHN,False,3.551644e-05
26,2,4,1,FIN,False,3.634026e-05
29,2,5,4,FIN,False,8.462689e-07
52,1,5,2,NOR,False,3.027843e-04
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,5.048101e-01
204023,2,3,1,ARG,False,5.959661e-01
204024,1,4,1,USA,False,1.325734e-01
204025,1,5,1,RUS,False,1.048315e-05


### Men's Trampolining Regression

In [304]:
# Preserve Trampolining column
mTrampolining_list = male_df['Trampolining'].tolist()

In [305]:
# Delete all sport columns
mTrampolining_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mTrampolining_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [306]:
# Add Trampolining column back in
mTrampolining_df['Trampolining'] = mTrampolining_list

mTrampolining_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Trampolining
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [307]:
mTrampolining_dummies = pd.get_dummies(mTrampolining_df)
mTrampolining_dummies.head(3)

Unnamed: 0,Trampolining,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [308]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mTrampolining_dummies.drop(columns=['Trampolining'])
y_train = mTrampolining_dummies['Trampolining']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mTrampolining_dummies.drop(columns=['Trampolining'])
y_test = mTrampolining_dummies['Trampolining']

In [309]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9992700009864851

In [310]:
mTrampolining_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mTrampolining_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mTrampolining_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mTrampolining_df)

array([4.10536604e-01, 9.34655564e-01, 5.84561190e-04, ...,
       5.37788807e-01, 5.65163429e-01, 4.99204665e-01])

Unnamed: 0,Age,Height,Weight,NOC,Trampolining
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [311]:
mTrampolining_pred = y_pred.tolist()

In [312]:
mTrampolining_df['Men_Trampolining_Prediction'] = mTrampolining_pred

In [313]:
mTrampolining_df

Unnamed: 0,Age,Height,Weight,NOC,Trampolining,Men_Trampolining_Prediction
0,1,4,2,CHN,False,4.105366e-01
1,1,3,1,CHN,False,9.346556e-01
26,2,4,1,FIN,False,5.845612e-04
29,2,5,4,FIN,False,2.752254e-07
52,1,5,2,NOR,False,1.802931e-05
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,4.084625e-03
204023,2,3,1,ARG,False,3.135712e-03
204024,1,4,1,USA,False,5.377888e-01
204025,1,5,1,RUS,False,5.651634e-01


### Men's Beach Volleyball Regression

In [314]:
# Preserve Beach Volleyball column
mBeach_Volleyball_list = male_df['Beach Volleyball'].tolist()

In [315]:
# Delete all sport columns
mBeach_Volleyball_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mBeach_Volleyball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [316]:
# Add Beach Volleyball column back in
mBeach_Volleyball_df['Beach Volleyball'] = mBeach_Volleyball_list

mBeach_Volleyball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Beach Volleyball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [317]:
mBeach_Volleyball_dummies = pd.get_dummies(mBeach_Volleyball_df)
mBeach_Volleyball_dummies.head(3)

Unnamed: 0,Beach Volleyball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [318]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mBeach_Volleyball_dummies.drop(columns=['Beach Volleyball'])
y_train = mBeach_Volleyball_dummies['Beach Volleyball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mBeach_Volleyball_dummies.drop(columns=['Beach Volleyball'])
y_test = mBeach_Volleyball_dummies['Beach Volleyball']

In [319]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9973068955312222

In [320]:
mBeach_Volleyball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mBeach_Volleyball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mBeach_Volleyball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mBeach_Volleyball_df)

array([1.74908611e-02, 6.48777568e-06, 3.91647267e-05, ...,
       1.58516495e-03, 9.18497905e-02, 4.24734191e-01])

Unnamed: 0,Age,Height,Weight,NOC,Beach Volleyball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [321]:
mBeach_Volleyball_pred = y_pred.tolist()

In [322]:
mBeach_Volleyball_df['Men_Beach_Volleyball_Prediction'] = mBeach_Volleyball_pred

In [323]:
mBeach_Volleyball_df

Unnamed: 0,Age,Height,Weight,NOC,Beach Volleyball,Men_Beach_Volleyball_Prediction
0,1,4,2,CHN,False,0.017491
1,1,3,1,CHN,False,0.000006
26,2,4,1,FIN,False,0.000039
29,2,5,4,FIN,False,0.000002
52,1,5,2,NOR,False,0.552501
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.000021
204023,2,3,1,ARG,False,0.000157
204024,1,4,1,USA,False,0.001585
204025,1,5,1,RUS,False,0.091850


### Men's Triathlon Regression

In [324]:
# Preserve Triathlon column
mTriathlon_list = male_df['Triathlon'].tolist()

In [325]:
# Delete all sport columns
mTriathlon_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mTriathlon_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [326]:
# Add Triathlon column back in
mTriathlon_df['Triathlon'] = mTriathlon_list

mTriathlon_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Triathlon
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [327]:
mTriathlon_dummies = pd.get_dummies(mTriathlon_df)
mTriathlon_dummies.head(3)

Unnamed: 0,Triathlon,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [328]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mTriathlon_dummies.drop(columns=['Triathlon'])
y_train = mTriathlon_dummies['Triathlon']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mTriathlon_dummies.drop(columns=['Triathlon'])
y_test = mTriathlon_dummies['Triathlon']

In [329]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9973858143434942

In [330]:
mTriathlon_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mTriathlon_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mTriathlon_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mTriathlon_df)

array([0.02313036, 0.09055459, 0.00452946, ..., 0.51617776, 0.93498796,
       0.98220192])

Unnamed: 0,Age,Height,Weight,NOC,Triathlon
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [331]:
mTriathlon_pred = y_pred.tolist()

In [332]:
mTriathlon_df['Men_Triathlon_Prediction'] = mTriathlon_pred

In [333]:
mTriathlon_df

Unnamed: 0,Age,Height,Weight,NOC,Triathlon,Men_Triathlon_Prediction
0,1,4,2,CHN,False,0.023130
1,1,3,1,CHN,False,0.090555
26,2,4,1,FIN,False,0.004529
29,2,5,4,FIN,False,0.000003
52,1,5,2,NOR,False,0.000203
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.130908
204023,2,3,1,ARG,False,0.366281
204024,1,4,1,USA,False,0.516178
204025,1,5,1,RUS,False,0.934988


### Men's Golf Regression

In [334]:
# Preserve Golf column
mGolf_list = male_df['Golf'].tolist()

In [335]:
# Delete all sport columns
mGolf_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mGolf_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [336]:
# Add Golf column back in
mGolf_df['Golf'] = mGolf_list

mGolf_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Golf
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [337]:
mGolf_dummies = pd.get_dummies(mGolf_df)
mGolf_dummies.head(3)

Unnamed: 0,Golf,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [338]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mGolf_dummies.drop(columns=['Golf'])
y_train = mGolf_dummies['Golf']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mGolf_dummies.drop(columns=['Golf'])
y_test = mGolf_dummies['Golf']

In [339]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9994771628686988

In [340]:
mGolf_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mGolf_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mGolf_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mGolf_df)

array([7.90489211e-01, 5.35731675e-01, 4.20546437e-01, ...,
       2.10480668e-01, 2.65078410e-04, 1.10592321e-03])

Unnamed: 0,Age,Height,Weight,NOC,Golf
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [341]:
mGolf_pred = y_pred.tolist()

In [342]:
mGolf_df['Men_Golf_Prediction'] = mGolf_pred

In [343]:
mGolf_df

Unnamed: 0,Age,Height,Weight,NOC,Golf,Men_Golf_Prediction
0,1,4,2,CHN,False,0.790489
1,1,3,1,CHN,False,0.535732
26,2,4,1,FIN,False,0.420546
29,2,5,4,FIN,False,0.002230
52,1,5,2,NOR,False,0.627388
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.312932
204023,2,3,1,ARG,False,0.655387
204024,1,4,1,USA,False,0.210481
204025,1,5,1,RUS,False,0.000265


### Women's Basketball Regression

In [344]:
# Preserve Basketball column
fBasketball_list = female_df['Basketball'].tolist()

In [345]:
# Delete all sport columns
fBasketball_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fBasketball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [346]:
# Add basketball column back in
fBasketball_df['Basketball'] = fBasketball_list

fBasketball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Basketball
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [347]:
fBasketball_dummies = pd.get_dummies(fBasketball_df)
fBasketball_dummies.head(3)

Unnamed: 0,Basketball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [348]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fBasketball_dummies.drop(columns=['Basketball'])
y_train = fBasketball_dummies['Basketball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fBasketball_dummies.drop(columns=['Basketball'])
y_test = fBasketball_dummies['Basketball']

In [349]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9788800997545104

In [350]:
fBasketball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fBasketball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fBasketball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fBasketball_df)

array([5.11636932e-06, 7.72450653e-03, 6.42400035e-03, ...,
       2.43759365e-01, 2.43759365e-01, 5.87893730e-01])

Unnamed: 0,Age,Height,Weight,NOC,Basketball
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [351]:
fBasketball_pred = y_pred.tolist()

In [352]:
fBasketball_df['Women_Basketball_Prediction'] = fBasketball_pred

In [353]:
fBasketball_df

Unnamed: 0,Age,Height,Weight,NOC,Basketball,Women_Basketball_Prediction
51,1,4,6,ROU,False,0.000005
69,1,5,2,NOR,False,0.007725
81,2,4,2,EST,False,0.006424
82,2,4,2,EST,False,0.006424
101,1,3,1,AZE,False,0.000180
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.625551
204017,3,4,2,GRC,False,0.433796
204019,1,4,2,RUS,False,0.243759
204020,1,4,2,RUS,False,0.243759


### Women's Judo Regression

In [354]:
# Preserve Judo column
fJudo_list = female_df['Judo'].tolist()

In [355]:
# Delete all sport columns
fJudo_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fJudo_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [356]:
# Add Judo column back in
fJudo_df['Judo'] = fJudo_list

fJudo_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Judo
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [357]:
fJudo_dummies = pd.get_dummies(fJudo_df)
fJudo_dummies.head(3)

Unnamed: 0,Judo,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [358]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fJudo_dummies.drop(columns=['Judo'])
y_train = fJudo_dummies['Judo']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fJudo_dummies.drop(columns=['Judo'])
y_test = fJudo_dummies['Judo']

In [359]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9808089467326501

In [360]:
fJudo_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fJudo_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fJudo_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fJudo_df)

array([0.99473576, 0.00113987, 0.03388727, ..., 0.34325817, 0.34325817,
       0.17772081])

Unnamed: 0,Age,Height,Weight,NOC,Judo
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [361]:
fJudo_pred = y_pred.tolist()

In [362]:
fJudo_df['Women_Judo_Prediction'] = fJudo_pred

In [363]:
fJudo_df

Unnamed: 0,Age,Height,Weight,NOC,Judo,Women_Judo_Prediction
51,1,4,6,ROU,False,0.994736
69,1,5,2,NOR,False,0.001140
81,2,4,2,EST,False,0.033887
82,2,4,2,EST,False,0.033887
101,1,3,1,AZE,False,0.801572
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.004727
204017,3,4,2,GRC,False,0.376415
204019,1,4,2,RUS,False,0.343258
204020,1,4,2,RUS,False,0.343258


### Women's Badminton Regression

In [364]:
# Preserve Badminton column
fBadminton_list = female_df['Badminton'].tolist()

In [365]:
# Delete all sport columns
fBadminton_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fBadminton_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [366]:
# Add badminton column back in
fBadminton_df['Badminton'] = fBadminton_list

fBadminton_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Badminton
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [367]:
fBadminton_dummies = pd.get_dummies(fBadminton_df)
fBadminton_dummies.head(3)

Unnamed: 0,Badminton,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [368]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fBadminton_dummies.drop(columns=['Badminton'])
y_train = fBadminton_dummies['Badminton']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fBadminton_dummies.drop(columns=['Badminton'])
y_test = fBadminton_dummies['Badminton']

In [369]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9871020535401162

In [370]:
fBadminton_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fBadminton_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fBadminton_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fBadminton_df)

array([6.69617761e-05, 9.93098559e-02, 7.17137509e-01, ...,
       5.60088125e-01, 5.60088125e-01, 3.11963919e-01])

Unnamed: 0,Age,Height,Weight,NOC,Badminton
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [371]:
fBadminton_pred = y_pred.tolist()

In [372]:
fBadminton_df['Women_Badminton_Prediction'] = fBadminton_pred

In [373]:
fBadminton_df

Unnamed: 0,Age,Height,Weight,NOC,Badminton,Women_Badminton_Prediction
51,1,4,6,ROU,False,0.000067
69,1,5,2,NOR,False,0.099310
81,2,4,2,EST,False,0.717138
82,2,4,2,EST,False,0.717138
101,1,3,1,AZE,False,0.013719
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.001745
204017,3,4,2,GRC,False,0.005760
204019,1,4,2,RUS,False,0.560088
204020,1,4,2,RUS,False,0.560088


### Women's Athletics Regression

In [374]:
# Preserve Athletics column
fAthletics_list = female_df['Athletics'].tolist()

In [375]:
# Delete all sport columns
fAthletics_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fAthletics_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [376]:
# Add Athletics column back in
fAthletics_df['Athletics'] = fAthletics_list

fAthletics_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Athletics
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,True
82,2,4,2,EST,True
101,1,3,1,AZE,False


In [377]:
fAthletics_dummies = pd.get_dummies(fAthletics_df)
fAthletics_dummies.head(3)

Unnamed: 0,Athletics,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,True,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [378]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fAthletics_dummies.drop(columns=['Athletics'])
y_train = fAthletics_dummies['Athletics']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fAthletics_dummies.drop(columns=['Athletics'])
y_test = fAthletics_dummies['Athletics']

In [379]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.828780734910182

In [380]:
fAthletics_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fAthletics_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fAthletics_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fAthletics_df)

array([0.64427895, 0.30408921, 0.69488694, ..., 0.45956683, 0.45956683,
       0.679498  ])

Unnamed: 0,Age,Height,Weight,NOC,Athletics
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,True
82,2,4,2,EST,True
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,True
204017,3,4,2,GRC,False
204019,1,4,2,RUS,True
204020,1,4,2,RUS,True


In [381]:
fAthletics_pred = y_pred.tolist()

In [382]:
fAthletics_df['Women_Athletics_Prediction'] = fAthletics_pred

In [383]:
fAthletics_df

Unnamed: 0,Age,Height,Weight,NOC,Athletics,Women_Athletics_Prediction
51,1,4,6,ROU,False,0.644279
69,1,5,2,NOR,False,0.304089
81,2,4,2,EST,True,0.694887
82,2,4,2,EST,True,0.694887
101,1,3,1,AZE,False,0.238475
...,...,...,...,...,...,...
204000,3,4,3,URS,True,0.578113
204017,3,4,2,GRC,False,0.637841
204019,1,4,2,RUS,True,0.459567
204020,1,4,2,RUS,True,0.459567


### Women's Weightlifting Regression

In [384]:
# Preserve Weightlifting column
fWeightlifting_list = female_df['Weightlifting'].tolist()

In [385]:
# Delete all sport columns
fWeightlifting_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fWeightlifting_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [386]:
# Add Weightlifting column back in
fWeightlifting_df['Weightlifting'] = fWeightlifting_list

fWeightlifting_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Weightlifting
51,1,4,6,ROU,True
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [387]:
fWeightlifting_dummies = pd.get_dummies(fWeightlifting_df)
fWeightlifting_dummies.head(3)

Unnamed: 0,Weightlifting,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,True,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [388]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fWeightlifting_dummies.drop(columns=['Weightlifting'])
y_train = fWeightlifting_dummies['Weightlifting']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fWeightlifting_dummies.drop(columns=['Weightlifting'])
y_test = fWeightlifting_dummies['Weightlifting']

In [389]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.991096130616062

In [390]:
fWeightlifting_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fWeightlifting_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fWeightlifting_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fWeightlifting_df)

array([0.98253974, 0.00387373, 0.01882631, ..., 0.28627168, 0.28627168,
       0.28151993])

Unnamed: 0,Age,Height,Weight,NOC,Weightlifting
51,1,4,6,ROU,True
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [391]:
fWeightlifting_pred = y_pred.tolist()

In [392]:
fWeightlifting_df['Women_Weightlifting_Prediction'] = fWeightlifting_pred

In [393]:
fWeightlifting_df

Unnamed: 0,Age,Height,Weight,NOC,Weightlifting,Women_Weightlifting_Prediction
51,1,4,6,ROU,True,0.982540
69,1,5,2,NOR,False,0.003874
81,2,4,2,EST,False,0.018826
82,2,4,2,EST,False,0.018826
101,1,3,1,AZE,False,0.737881
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.004326
204017,3,4,2,GRC,False,0.247884
204019,1,4,2,RUS,False,0.286272
204020,1,4,2,RUS,False,0.286272


### Women's Wrestling Regression

In [394]:
# Preserve Wrestling column
fWrestling_list = female_df['Wrestling'].tolist()

In [395]:
# Delete all sport columns
fWrestling_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fWrestling_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [396]:
# Add Wrestling column back in
fWrestling_df['Wrestling'] = fWrestling_list

fWrestling_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Wrestling
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [397]:
fWrestling_dummies = pd.get_dummies(fWrestling_df)
fWrestling_dummies.head(3)

Unnamed: 0,Wrestling,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [398]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fWrestling_dummies.drop(columns=['Wrestling'])
y_train = fWrestling_dummies['Wrestling']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fWrestling_dummies.drop(columns=['Wrestling'])
y_test = fWrestling_dummies['Wrestling']

In [399]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9940965592487239

In [400]:
fWrestling_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fWrestling_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fWrestling_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fWrestling_df)

array([0.01845934, 0.11295265, 0.58676824, ..., 0.55966432, 0.55966432,
       0.77969913])

Unnamed: 0,Age,Height,Weight,NOC,Wrestling
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,True
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [401]:
fWrestling_pred = y_pred.tolist()

In [402]:
fWrestling_df['Women_Wrestling_Prediction'] = fWrestling_pred

In [403]:
fWrestling_df

Unnamed: 0,Age,Height,Weight,NOC,Wrestling,Women_Wrestling_Prediction
51,1,4,6,ROU,False,0.018459
69,1,5,2,NOR,False,0.112953
81,2,4,2,EST,False,0.586768
82,2,4,2,EST,False,0.586768
101,1,3,1,AZE,False,0.946284
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.006858
204017,3,4,2,GRC,True,0.724201
204019,1,4,2,RUS,False,0.559664
204020,1,4,2,RUS,False,0.559664


### Women's Rowing Regression

In [404]:
# Preserve Rowing column
fRowing_list = female_df['Rowing'].tolist()

In [405]:
# Delete all sport columns
fRowing_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fRowing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [406]:
# Add Rowing column back in
fRowing_df['Rowing'] = fRowing_list

fRowing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Rowing
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [407]:
fRowing_dummies = pd.get_dummies(fRowing_df)
fRowing_dummies.head(3)

Unnamed: 0,Rowing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [408]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fRowing_dummies.drop(columns=['Rowing'])
y_train = fRowing_dummies['Rowing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fRowing_dummies.drop(columns=['Rowing'])
y_test = fRowing_dummies['Rowing']

In [409]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9599228461208744

In [410]:
fRowing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fRowing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fRowing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fRowing_df)

array([0.05562219, 0.34601476, 0.01505561, ..., 0.10639681, 0.10639681,
       0.47629225])

Unnamed: 0,Age,Height,Weight,NOC,Rowing
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [411]:
fRowing_pred = y_pred.tolist()

In [412]:
fRowing_df['Women_Rowing_Prediction'] = fRowing_pred

In [413]:
fRowing_df

Unnamed: 0,Age,Height,Weight,NOC,Rowing,Women_Rowing_Prediction
51,1,4,6,ROU,False,0.055622
69,1,5,2,NOR,False,0.346015
81,2,4,2,EST,False,0.015056
82,2,4,2,EST,False,0.015056
101,1,3,1,AZE,False,0.033527
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.846912
204017,3,4,2,GRC,False,0.414010
204019,1,4,2,RUS,False,0.106397
204020,1,4,2,RUS,False,0.106397


### Women's Swimming Regression

In [414]:
# Preserve Swimming column
fSwimming_list = female_df['Swimming'].tolist()

In [415]:
# Delete all sport columns
fSwimming_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fSwimming_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [416]:
# Add Swimming column back in
fSwimming_df['Swimming'] = fSwimming_list

fSwimming_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Swimming
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [417]:
fSwimming_dummies = pd.get_dummies(fSwimming_df)
fSwimming_dummies.head(3)

Unnamed: 0,Swimming,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [418]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fSwimming_dummies.drop(columns=['Swimming'])
y_train = fSwimming_dummies['Swimming']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fSwimming_dummies.drop(columns=['Swimming'])
y_test = fSwimming_dummies['Swimming']

In [419]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.8699294704438296

In [420]:
fSwimming_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fSwimming_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fSwimming_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fSwimming_df)

array([0.02271105, 0.58712472, 0.31847591, ..., 0.59670206, 0.59670206,
       0.05089363])

Unnamed: 0,Age,Height,Weight,NOC,Swimming
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [421]:
fSwimming_pred = y_pred.tolist()

In [422]:
fSwimming_df['Women_Swimming_Prediction'] = fSwimming_pred

In [423]:
fSwimming_df

Unnamed: 0,Age,Height,Weight,NOC,Swimming,Women_Swimming_Prediction
51,1,4,6,ROU,False,0.022711
69,1,5,2,NOR,False,0.587125
81,2,4,2,EST,False,0.318476
82,2,4,2,EST,False,0.318476
101,1,3,1,AZE,False,0.260612
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.024531
204017,3,4,2,GRC,False,0.077405
204019,1,4,2,RUS,False,0.596702
204020,1,4,2,RUS,False,0.596702


### Women's Football Regression

In [424]:
# Preserve Football column
fFootball_list = female_df['Football'].tolist()

In [425]:
# Delete all sport columns
fFootball_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fFootball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [426]:
# Add Football column back in
fFootball_df['Football'] = fFootball_list

fFootball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Football
51,1,4,6,ROU,False
69,1,5,2,NOR,True
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [427]:
fFootball_dummies = pd.get_dummies(fFootball_df)
fFootball_dummies.head(3)

Unnamed: 0,Football,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,True,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [428]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fFootball_dummies.drop(columns=['Football'])
y_train = fFootball_dummies['Football']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fFootball_dummies.drop(columns=['Football'])
y_test = fFootball_dummies['Football']

In [429]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9808284300354596

In [430]:
fFootball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fFootball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fFootball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fFootball_df)

array([9.44141374e-05, 7.90493096e-01, 3.20917126e-03, ...,
       1.73021000e-03, 1.73021000e-03, 2.86164914e-03])

Unnamed: 0,Age,Height,Weight,NOC,Football
51,1,4,6,ROU,False
69,1,5,2,NOR,True
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [431]:
fFootball_pred = y_pred.tolist()

In [432]:
fFootball_df['Women_Football_Prediction'] = fFootball_pred

In [433]:
fFootball_df

Unnamed: 0,Age,Height,Weight,NOC,Football,Women_Football_Prediction
51,1,4,6,ROU,False,0.000094
69,1,5,2,NOR,True,0.790493
81,2,4,2,EST,False,0.003209
82,2,4,2,EST,False,0.003209
101,1,3,1,AZE,False,0.001655
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.001913
204017,3,4,2,GRC,False,0.744693
204019,1,4,2,RUS,False,0.001730
204020,1,4,2,RUS,False,0.001730


### Women's Equestrianism Regression

In [434]:
# Preserve Equestrianism column
fEquestrianism_list = female_df['Equestrianism'].tolist()

In [435]:
# Delete all sport columns
fEquestrianism_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fEquestrianism_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [436]:
# Add Equestrianism column back in
fEquestrianism_df['Equestrianism'] = fEquestrianism_list

fEquestrianism_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Equestrianism
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [437]:
fEquestrianism_dummies = pd.get_dummies(fEquestrianism_df)
fEquestrianism_dummies.head(3)

Unnamed: 0,Equestrianism,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [438]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fEquestrianism_dummies.drop(columns=['Equestrianism'])
y_train = fEquestrianism_dummies['Equestrianism']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fEquestrianism_dummies.drop(columns=['Equestrianism'])
y_test = fEquestrianism_dummies['Equestrianism']

In [439]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.979971164711842

In [440]:
fEquestrianism_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fEquestrianism_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fEquestrianism_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fEquestrianism_df)

array([0.00459799, 0.00085503, 0.01410746, ..., 0.07108958, 0.07108958,
       0.46066987])

Unnamed: 0,Age,Height,Weight,NOC,Equestrianism
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [441]:
fEquestrianism_pred = y_pred.tolist()

In [442]:
fEquestrianism_df['Women_Equestrianism_Prediction'] = fEquestrianism_pred

In [443]:
fEquestrianism_df

Unnamed: 0,Age,Height,Weight,NOC,Equestrianism,Women_Equestrianism_Prediction
51,1,4,6,ROU,False,0.004598
69,1,5,2,NOR,False,0.000855
81,2,4,2,EST,False,0.014107
82,2,4,2,EST,False,0.014107
101,1,3,1,AZE,False,0.001498
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.169178
204017,3,4,2,GRC,False,0.781516
204019,1,4,2,RUS,False,0.071090
204020,1,4,2,RUS,False,0.071090


### Women's Shooting Regression

In [444]:
# Preserve Shooting column
fShooting_list = female_df['Shooting'].tolist()

In [445]:
# Delete all sport columns
fShooting_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fShooting_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [446]:
# Add Shooting column back in
fShooting_df['Shooting'] = fShooting_list

fShooting_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Shooting
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [447]:
fShooting_dummies = pd.get_dummies(fShooting_df)
fShooting_dummies.head(3)

Unnamed: 0,Shooting,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [448]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fShooting_dummies.drop(columns=['Shooting'])
y_train = fShooting_dummies['Shooting']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fShooting_dummies.drop(columns=['Shooting'])
y_test = fShooting_dummies['Shooting']

In [449]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.968612399173908

In [450]:
fShooting_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fShooting_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fShooting_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fShooting_df)

array([0.00536593, 0.20229796, 0.32355774, ..., 0.35336795, 0.35336795,
       0.8292166 ])

Unnamed: 0,Age,Height,Weight,NOC,Shooting
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [451]:
fShooting_pred = y_pred.tolist()

In [452]:
fShooting_df['Women_Shooting_Prediction'] = fShooting_pred

In [453]:
fShooting_df

Unnamed: 0,Age,Height,Weight,NOC,Shooting,Women_Shooting_Prediction
51,1,4,6,ROU,False,0.005366
69,1,5,2,NOR,False,0.202298
81,2,4,2,EST,False,0.323558
82,2,4,2,EST,False,0.323558
101,1,3,1,AZE,False,0.317261
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.718702
204017,3,4,2,GRC,False,0.739487
204019,1,4,2,RUS,False,0.353368
204020,1,4,2,RUS,False,0.353368


### Women's Gymnastics Regression

In [454]:
# Preserve Gymnastics column
fGymnastics_list = female_df['Gymnastics'].tolist()

In [455]:
# Delete all sport columns
fGymnastics_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fGymnastics_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [456]:
# Add Gymnastics column back in
fGymnastics_df['Gymnastics'] = fGymnastics_list

fGymnastics_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Gymnastics
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [457]:
fGymnastics_dummies = pd.get_dummies(fGymnastics_df)
fGymnastics_dummies.head(3)

Unnamed: 0,Gymnastics,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [458]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fGymnastics_dummies.drop(columns=['Gymnastics'])
y_train = fGymnastics_dummies['Gymnastics']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fGymnastics_dummies.drop(columns=['Gymnastics'])
y_test = fGymnastics_dummies['Gymnastics']

In [459]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9315356739274442

In [460]:
fGymnastics_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fGymnastics_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fGymnastics_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fGymnastics_df)

array([0.18075826, 0.03310966, 0.00054495, ..., 0.03651706, 0.03651706,
       0.00101684])

Unnamed: 0,Age,Height,Weight,NOC,Gymnastics
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [461]:
fGymnastics_pred = y_pred.tolist()

In [462]:
fGymnastics_df['Women_Gymnastics_Prediction'] = fGymnastics_pred

In [463]:
fGymnastics_df

Unnamed: 0,Age,Height,Weight,NOC,Gymnastics,Women_Gymnastics_Prediction
51,1,4,6,ROU,False,0.180758
69,1,5,2,NOR,False,0.033110
81,2,4,2,EST,False,0.000545
82,2,4,2,EST,False,0.000545
101,1,3,1,AZE,False,0.124524
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.000067
204017,3,4,2,GRC,False,0.000590
204019,1,4,2,RUS,False,0.036517
204020,1,4,2,RUS,False,0.036517


### Women's Taekwondo Regression

In [464]:
# Preserve Taekwondo column
fTaekwondo_list = female_df['Taekwondo'].tolist()

In [465]:
# Delete all sport columns
fTaekwondo_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fTaekwondo_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [466]:
# Add Taekwondo column back in
fTaekwondo_df['Taekwondo'] = fTaekwondo_list

fTaekwondo_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Taekwondo
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,True


In [467]:
fTaekwondo_dummies = pd.get_dummies(fTaekwondo_df)
fTaekwondo_dummies.head(3)

Unnamed: 0,Taekwondo,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [468]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fTaekwondo_dummies.drop(columns=['Taekwondo'])
y_train = fTaekwondo_dummies['Taekwondo']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fTaekwondo_dummies.drop(columns=['Taekwondo'])
y_test = fTaekwondo_dummies['Taekwondo']

In [469]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9943498421852472

In [470]:
fTaekwondo_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fTaekwondo_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fTaekwondo_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fTaekwondo_df)

array([2.02827961e-05, 7.21496852e-01, 2.19871195e-02, ...,
       4.34544599e-01, 4.34544599e-01, 2.89056025e-03])

Unnamed: 0,Age,Height,Weight,NOC,Taekwondo
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,True
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [471]:
fTaekwondo_pred = y_pred.tolist()

In [472]:
fTaekwondo_df['Women_Taekwondo_Prediction'] = fTaekwondo_pred

In [473]:
fTaekwondo_df

Unnamed: 0,Age,Height,Weight,NOC,Taekwondo,Women_Taekwondo_Prediction
51,1,4,6,ROU,False,0.000020
69,1,5,2,NOR,False,0.721497
81,2,4,2,EST,False,0.021987
82,2,4,2,EST,False,0.021987
101,1,3,1,AZE,True,0.935828
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.001639
204017,3,4,2,GRC,False,0.512600
204019,1,4,2,RUS,False,0.434545
204020,1,4,2,RUS,False,0.434545


### Women's Boxing Regression

In [474]:
# Preserve Boxing column
fBoxing_list = female_df['Boxing'].tolist()

In [475]:
# Delete all sport columns
fBoxing_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fBoxing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [476]:
# Add Boxing column back in
fBoxing_df['Boxing'] = fBoxing_list

fBoxing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Boxing
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [477]:
fBoxing_dummies = pd.get_dummies(fBoxing_df)
fBoxing_dummies.head(3)

Unnamed: 0,Boxing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [478]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fBoxing_dummies.drop(columns=['Boxing'])
y_train = fBoxing_dummies['Boxing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fBoxing_dummies.drop(columns=['Boxing'])
y_test = fBoxing_dummies['Boxing']

In [479]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9988504851342399

In [480]:
fBoxing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fBoxing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fBoxing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fBoxing_df)

array([1.56180947e-05, 2.09639619e-04, 1.93072537e-03, ...,
       4.67803650e-01, 4.67803650e-01, 2.17938028e-03])

Unnamed: 0,Age,Height,Weight,NOC,Boxing
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [481]:
fBoxing_pred = y_pred.tolist()

In [482]:
fBoxing_df['Women_Boxing_Prediction'] = fBoxing_pred

In [483]:
fBoxing_df

Unnamed: 0,Age,Height,Weight,NOC,Boxing,Women_Boxing_Prediction
51,1,4,6,ROU,False,0.000016
69,1,5,2,NOR,False,0.000210
81,2,4,2,EST,False,0.001931
82,2,4,2,EST,False,0.001931
101,1,3,1,AZE,False,0.915797
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.023992
204017,3,4,2,GRC,False,0.002022
204019,1,4,2,RUS,False,0.467804
204020,1,4,2,RUS,False,0.467804


### Women's Fencing Regression

In [484]:
# Preserve Fencing column
fFencing_list = female_df['Fencing'].tolist()

In [485]:
# Delete all sport columns
fFencing_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fFencing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [486]:
# Add Fencing column back in
fFencing_df['Fencing'] = fFencing_list

fFencing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Fencing
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [487]:
fFencing_dummies = pd.get_dummies(fFencing_df)
fFencing_dummies.head(3)

Unnamed: 0,Fencing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [488]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fFencing_dummies.drop(columns=['Fencing'])
y_train = fFencing_dummies['Fencing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fFencing_dummies.drop(columns=['Fencing'])
y_test = fFencing_dummies['Fencing']

In [489]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9660990531114835

In [490]:
fFencing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fFencing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fFencing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fFencing_df)

array([0.04798111, 0.26361201, 0.89965773, ..., 0.64202156, 0.64202156,
       0.12085226])

Unnamed: 0,Age,Height,Weight,NOC,Fencing
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [491]:
fFencing_pred = y_pred.tolist()

In [492]:
fFencing_df['Women_Fencing_Prediction'] = fFencing_pred

In [493]:
fFencing_df

Unnamed: 0,Age,Height,Weight,NOC,Fencing,Women_Fencing_Prediction
51,1,4,6,ROU,False,0.047981
69,1,5,2,NOR,False,0.263612
81,2,4,2,EST,False,0.899658
82,2,4,2,EST,False,0.899658
101,1,3,1,AZE,False,0.437322
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.618081
204017,3,4,2,GRC,False,0.587443
204019,1,4,2,RUS,False,0.642022
204020,1,4,2,RUS,False,0.642022


### Women's Diving Regression

In [494]:
# Preserve Diving column
fDiving_list = female_df['Diving'].tolist()

In [495]:
# Delete all sport columns
fDiving_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fDiving_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [496]:
# Add Diving column back in
fDiving_df['Diving'] = fDiving_list

fDiving_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Diving
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [497]:
fDiving_dummies = pd.get_dummies(fDiving_df)
fDiving_dummies.head(3)

Unnamed: 0,Diving,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [498]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fDiving_dummies.drop(columns=['Diving'])
y_train = fDiving_dummies['Diving']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fDiving_dummies.drop(columns=['Diving'])
y_test = fDiving_dummies['Diving']

In [499]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9818220784787437

In [500]:
fDiving_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fDiving_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fDiving_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fDiving_df)

array([0.08038667, 0.02680919, 0.00906137, ..., 0.45758622, 0.45758622,
       0.05656782])

Unnamed: 0,Age,Height,Weight,NOC,Diving
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [501]:
fDiving_pred = y_pred.tolist()

In [502]:
fDiving_df['Women_Diving_Prediction'] = fDiving_pred

In [503]:
fDiving_df

Unnamed: 0,Age,Height,Weight,NOC,Diving,Women_Diving_Prediction
51,1,4,6,ROU,False,0.080387
69,1,5,2,NOR,False,0.026809
81,2,4,2,EST,False,0.009061
82,2,4,2,EST,False,0.009061
101,1,3,1,AZE,False,0.059843
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.013594
204017,3,4,2,GRC,False,0.133302
204019,1,4,2,RUS,False,0.457586
204020,1,4,2,RUS,False,0.457586


### Women's Canoeing Regression

In [504]:
# Preserve Canoeing column
fCanoeing_list = female_df['Canoeing'].tolist()

In [505]:
# Delete all sport columns
fCanoeing_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fCanoeing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [506]:
# Add Canoeing column back in
fCanoeing_df['Canoeing'] = fCanoeing_list

fCanoeing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Canoeing
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [507]:
fCanoeing_dummies = pd.get_dummies(fCanoeing_df)
fCanoeing_dummies.head(3)

Unnamed: 0,Canoeing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [508]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fCanoeing_dummies.drop(columns=['Canoeing'])
y_train = fCanoeing_dummies['Canoeing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fCanoeing_dummies.drop(columns=['Canoeing'])
y_test = fCanoeing_dummies['Canoeing']

In [509]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9752367221291354

In [510]:
fCanoeing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fCanoeing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fCanoeing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fCanoeing_df)

array([0.10922718, 0.52777246, 0.05301432, ..., 0.58955974, 0.58955974,
       0.83530045])

Unnamed: 0,Age,Height,Weight,NOC,Canoeing
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [511]:
fCanoeing_pred = y_pred.tolist()

In [512]:
fCanoeing_df['Women_Canoeing_Prediction'] = fCanoeing_pred

In [513]:
fCanoeing_df

Unnamed: 0,Age,Height,Weight,NOC,Canoeing,Women_Canoeing_Prediction
51,1,4,6,ROU,False,0.109227
69,1,5,2,NOR,False,0.527772
81,2,4,2,EST,False,0.053014
82,2,4,2,EST,False,0.053014
101,1,3,1,AZE,False,0.186106
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.526178
204017,3,4,2,GRC,False,0.361251
204019,1,4,2,RUS,False,0.589560
204020,1,4,2,RUS,False,0.589560


### Women's Handball Regression

In [514]:
# Preserve Handball column
fHandball_list = female_df['Handball'].tolist()

In [515]:
# Delete all sport columns
fHandball_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fHandball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [516]:
# Add Handball column back in
fHandball_df['Handball'] = fHandball_list

fHandball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Handball
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [517]:
fHandball_dummies = pd.get_dummies(fHandball_df)
fHandball_dummies.head(3)

Unnamed: 0,Handball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [518]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fHandball_dummies.drop(columns=['Handball'])
y_train = fHandball_dummies['Handball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fHandball_dummies.drop(columns=['Handball'])
y_test = fHandball_dummies['Handball']

In [519]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9762888204808479

In [520]:
fHandball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fHandball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fHandball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fHandball_df)

array([0.07797651, 0.91908208, 0.0124314 , ..., 0.48685711, 0.48685711,
       0.00754891])

Unnamed: 0,Age,Height,Weight,NOC,Handball
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [521]:
fHandball_pred = y_pred.tolist()

In [522]:
fHandball_df['Women_Handball_Prediction'] = fHandball_pred

In [523]:
fHandball_df

Unnamed: 0,Age,Height,Weight,NOC,Handball,Women_Handball_Prediction
51,1,4,6,ROU,False,0.077977
69,1,5,2,NOR,False,0.919082
81,2,4,2,EST,False,0.012431
82,2,4,2,EST,False,0.012431
101,1,3,1,AZE,False,0.000425
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.853221
204017,3,4,2,GRC,False,0.687495
204019,1,4,2,RUS,False,0.486857
204020,1,4,2,RUS,False,0.486857


### Women's Water Polo Regression

In [524]:
# Preserve Water Polo column
fWater_Polo_list = female_df['Water Polo'].tolist()

In [525]:
# Delete all sport columns
fWater_Polo_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fWater_Polo_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [526]:
# Add Water Polo column back in
fWater_Polo_df['Water Polo'] = fWater_Polo_list

fWater_Polo_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Water Polo
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [527]:
fWater_Polo_dummies = pd.get_dummies(fWater_Polo_df)
fWater_Polo_dummies.head(3)

Unnamed: 0,Water Polo,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [528]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fWater_Polo_dummies.drop(columns=['Water Polo'])
y_train = fWater_Polo_dummies['Water Polo']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fWater_Polo_dummies.drop(columns=['Water Polo'])
y_test = fWater_Polo_dummies['Water Polo']

In [529]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9904921482289678

In [530]:
fWater_Polo_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fWater_Polo_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fWater_Polo_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fWater_Polo_df)

array([3.39727579e-05, 1.70957992e-03, 1.46109885e-03, ...,
       8.93938141e-01, 8.93938141e-01, 8.51641204e-04])

Unnamed: 0,Age,Height,Weight,NOC,Water Polo
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [531]:
fWater_Polo_pred = y_pred.tolist()

In [532]:
fWater_Polo_df['Women_Water_Polo_Prediction'] = fWater_Polo_pred

In [533]:
fWater_Polo_df

Unnamed: 0,Age,Height,Weight,NOC,Water Polo,Women_Water_Polo_Prediction
51,1,4,6,ROU,False,0.000034
69,1,5,2,NOR,False,0.001710
81,2,4,2,EST,False,0.001461
82,2,4,2,EST,False,0.001461
101,1,3,1,AZE,False,0.000027
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.002304
204017,3,4,2,GRC,False,0.838045
204019,1,4,2,RUS,False,0.893938
204020,1,4,2,RUS,False,0.893938


### Women's Tennis Regression

In [534]:
# Preserve Tennis column
fTennis_list = female_df['Tennis'].tolist()

In [535]:
# Delete all sport columns
fTennis_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fTennis_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [536]:
# Add Tennis column back in
fTennis_df['Tennis'] = fTennis_list

fTennis_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Tennis
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [537]:
fTennis_dummies = pd.get_dummies(fTennis_df)
fTennis_dummies.head(3)

Unnamed: 0,Tennis,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [538]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fTennis_dummies.drop(columns=['Tennis'])
y_train = fTennis_dummies['Tennis']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fTennis_dummies.drop(columns=['Tennis'])
y_test = fTennis_dummies['Tennis']

In [539]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9812180960916494

In [540]:
fTennis_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fTennis_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fTennis_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fTennis_df)

array([0.02180457, 0.01103575, 0.88946948, ..., 0.68597014, 0.68597014,
       0.60934675])

Unnamed: 0,Age,Height,Weight,NOC,Tennis
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [541]:
fTennis_pred = y_pred.tolist()

In [542]:
fTennis_df['Women_Tennis_Prediction'] = fTennis_pred

In [543]:
fTennis_df

Unnamed: 0,Age,Height,Weight,NOC,Tennis,Women_Tennis_Prediction
51,1,4,6,ROU,False,0.021805
69,1,5,2,NOR,False,0.011036
81,2,4,2,EST,False,0.889469
82,2,4,2,EST,False,0.889469
101,1,3,1,AZE,False,0.017064
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.085381
204017,3,4,2,GRC,False,0.625982
204019,1,4,2,RUS,False,0.685970
204020,1,4,2,RUS,False,0.685970


### Women's Cycling Regression

In [544]:
# Preserve Cycling column
fCycling_list = female_df['Cycling'].tolist()

In [545]:
# Delete all sport columns
fCycling_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fCycling_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [546]:
# Add Cycling column back in
fCycling_df['Cycling'] = fCycling_list

fCycling_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Cycling
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [547]:
fCycling_dummies = pd.get_dummies(fCycling_df)
fCycling_dummies.head(3)

Unnamed: 0,Cycling,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [548]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fCycling_dummies.drop(columns=['Cycling'])
y_train = fCycling_dummies['Cycling']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fCycling_dummies.drop(columns=['Cycling'])
y_test = fCycling_dummies['Cycling']

In [549]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9739313408408994

In [550]:
fCycling_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fCycling_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fCycling_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fCycling_df)

array([1.03648436e-04, 6.20898483e-01, 7.46799512e-01, ...,
       6.07020446e-01, 6.07020446e-01, 7.91224910e-01])

Unnamed: 0,Age,Height,Weight,NOC,Cycling
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [551]:
fCycling_pred = y_pred.tolist()

In [552]:
fCycling_df['Women_Cycling_Prediction'] = fCycling_pred

In [553]:
fCycling_df

Unnamed: 0,Age,Height,Weight,NOC,Cycling,Women_Cycling_Prediction
51,1,4,6,ROU,False,0.000104
69,1,5,2,NOR,False,0.620898
81,2,4,2,EST,False,0.746800
82,2,4,2,EST,False,0.746800
101,1,3,1,AZE,False,0.688040
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.123856
204017,3,4,2,GRC,False,0.158945
204019,1,4,2,RUS,False,0.607020
204020,1,4,2,RUS,False,0.607020


### Women's Hockey Regression

In [554]:
# Preserve Hockey column
fHockey_list = female_df['Hockey'].tolist()

In [555]:
# Delete all sport columns
fHockey_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fHockey_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [556]:
# Add Hockey column back in
fHockey_df['Hockey'] = fHockey_list

fHockey_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Hockey
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [557]:
fHockey_dummies = pd.get_dummies(fHockey_df)
fHockey_dummies.head(3)

Unnamed: 0,Hockey,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [558]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fHockey_dummies.drop(columns=['Hockey'])
y_train = fHockey_dummies['Hockey']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fHockey_dummies.drop(columns=['Hockey'])
y_test = fHockey_dummies['Hockey']

In [559]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9733078751509956

In [560]:
fHockey_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fHockey_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fHockey_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fHockey_df)

array([0.00014197, 0.00052838, 0.00397662, ..., 0.00146686, 0.00146686,
       0.00326219])

Unnamed: 0,Age,Height,Weight,NOC,Hockey
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [561]:
fHockey_pred = y_pred.tolist()

In [562]:
fHockey_df['Women_Hockey_Prediction'] = fHockey_pred

In [563]:
fHockey_df

Unnamed: 0,Age,Height,Weight,NOC,Hockey,Women_Hockey_Prediction
51,1,4,6,ROU,False,0.000142
69,1,5,2,NOR,False,0.000528
81,2,4,2,EST,False,0.003977
82,2,4,2,EST,False,0.003977
101,1,3,1,AZE,False,0.001555
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.225058
204017,3,4,2,GRC,False,0.003262
204019,1,4,2,RUS,False,0.001467
204020,1,4,2,RUS,False,0.001467


### Women's Archery Regression

In [564]:
# Preserve Archery column
fArchery_list = female_df['Archery'].tolist()

In [565]:
# Delete all sport columns
fArchery_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fArchery_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [566]:
# Add Archery column back in
fArchery_df['Archery'] = fArchery_list

fArchery_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Archery
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [567]:
fArchery_dummies = pd.get_dummies(fArchery_df)
fArchery_dummies.head(3)

Unnamed: 0,Archery,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [568]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fArchery_dummies.drop(columns=['Archery'])
y_train = fArchery_dummies['Archery']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fArchery_dummies.drop(columns=['Archery'])
y_test = fArchery_dummies['Archery']

In [569]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9827183104079804

In [570]:
fArchery_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fArchery_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fArchery_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fArchery_df)

array([0.00553864, 0.15462644, 0.68706716, ..., 0.64138045, 0.64138045,
       0.52001299])

Unnamed: 0,Age,Height,Weight,NOC,Archery
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [571]:
fArchery_pred = y_pred.tolist()

In [572]:
fArchery_df['Women_Archery_Prediction'] = fArchery_pred

In [573]:
fArchery_df

Unnamed: 0,Age,Height,Weight,NOC,Archery,Women_Archery_Prediction
51,1,4,6,ROU,False,0.005539
69,1,5,2,NOR,False,0.154626
81,2,4,2,EST,False,0.687067
82,2,4,2,EST,False,0.687067
101,1,3,1,AZE,False,0.594159
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.684232
204017,3,4,2,GRC,False,0.731854
204019,1,4,2,RUS,False,0.641380
204020,1,4,2,RUS,False,0.641380


### Women's Softball Regression

In [574]:
# Preserve Softball column
fSoftball_list = female_df['Softball'].tolist()

In [575]:
# Delete all sport columns
fSoftball_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fSoftball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [576]:
# Add Softball column back in
fSoftball_df['Softball'] = fSoftball_list

fSoftball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Softball
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [577]:
fSoftball_dummies = pd.get_dummies(fSoftball_df)
fSoftball_dummies.head(3)

Unnamed: 0,Softball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [578]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fSoftball_dummies.drop(columns=['Softball'])
y_train = fSoftball_dummies['Softball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fSoftball_dummies.drop(columns=['Softball'])
y_test = fSoftball_dummies['Softball']

In [579]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.991407863461014

In [580]:
fSoftball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fSoftball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fSoftball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fSoftball_df)

array([1.75852530e-05, 2.44030504e-04, 1.53654471e-03, ...,
       7.22507450e-04, 7.22507450e-04, 2.04143626e-03])

Unnamed: 0,Age,Height,Weight,NOC,Softball
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [581]:
fSoftball_pred = y_pred.tolist()

In [582]:
fSoftball_df['Women_Softball_Prediction'] = fSoftball_pred

In [583]:
fSoftball_df

Unnamed: 0,Age,Height,Weight,NOC,Softball,Women_Softball_Prediction
51,1,4,6,ROU,False,0.000018
69,1,5,2,NOR,False,0.000244
81,2,4,2,EST,False,0.001537
82,2,4,2,EST,False,0.001537
101,1,3,1,AZE,False,0.000094
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.006138
204017,3,4,2,GRC,False,0.897406
204019,1,4,2,RUS,False,0.000723
204020,1,4,2,RUS,False,0.000723


### Women's Volleyball Regression

In [584]:
# Preserve Volleyball column
fVolleyball_list = female_df['Volleyball'].tolist()

In [585]:
# Delete all sport columns
fVolleyball_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fVolleyball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [586]:
# Add Volleyball column back in
fVolleyball_df['Volleyball'] = fVolleyball_list

fVolleyball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Volleyball
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [587]:
fVolleyball_dummies = pd.get_dummies(fVolleyball_df)
fVolleyball_dummies.head(3)

Unnamed: 0,Volleyball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [588]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fVolleyball_dummies.drop(columns=['Volleyball'])
y_train = fVolleyball_dummies['Volleyball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fVolleyball_dummies.drop(columns=['Volleyball'])
y_test = fVolleyball_dummies['Volleyball']

In [589]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9728207925807583

In [590]:
fVolleyball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fVolleyball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fVolleyball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fVolleyball_df)

array([0.00179826, 0.01049713, 0.00864608, ..., 0.27661843, 0.27661843,
       0.00156231])

Unnamed: 0,Age,Height,Weight,NOC,Volleyball
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [591]:
fVolleyball_pred = y_pred.tolist()

In [592]:
fVolleyball_df['Women_Volleyball_Prediction'] = fVolleyball_pred

In [593]:
fVolleyball_df

Unnamed: 0,Age,Height,Weight,NOC,Volleyball,Women_Volleyball_Prediction
51,1,4,6,ROU,False,0.001798
69,1,5,2,NOR,False,0.010497
81,2,4,2,EST,False,0.008646
82,2,4,2,EST,False,0.008646
101,1,3,1,AZE,False,0.000249
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.710422
204017,3,4,2,GRC,False,0.226568
204019,1,4,2,RUS,False,0.276618
204020,1,4,2,RUS,False,0.276618


### Women's Modern Pentathlon Regression

In [594]:
# Preserve Modern Pentathlon column
fModern_Pentathlon_list = female_df['Modern Pentathlon'].tolist()

In [595]:
# Delete all sport columns
fModern_Pentathlon_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fModern_Pentathlon_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [596]:
# Add Modern Pentathlon column back in
fModern_Pentathlon_df['Modern Pentathlon'] = fModern_Pentathlon_list

fModern_Pentathlon_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Modern Pentathlon
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [597]:
fModern_Pentathlon_dummies = pd.get_dummies(fModern_Pentathlon_df)
fModern_Pentathlon_dummies.head(3)

Unnamed: 0,Modern Pentathlon,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [598]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fModern_Pentathlon_dummies.drop(columns=['Modern Pentathlon'])
y_train = fModern_Pentathlon_dummies['Modern Pentathlon']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fModern_Pentathlon_dummies.drop(columns=['Modern Pentathlon'])
y_test = fModern_Pentathlon_dummies['Modern Pentathlon']

In [599]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9968047383392433

In [600]:
fModern_Pentathlon_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fModern_Pentathlon_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fModern_Pentathlon_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fModern_Pentathlon_df)

array([2.02071121e-05, 6.86041087e-03, 9.71773070e-03, ...,
       7.22500446e-01, 7.22500446e-01, 8.85678120e-01])

Unnamed: 0,Age,Height,Weight,NOC,Modern Pentathlon
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [601]:
fModern_Pentathlon_pred = y_pred.tolist()

In [602]:
fModern_Pentathlon_df['Women_Modern_Pentathlon_Prediction'] = fModern_Pentathlon_pred

In [603]:
fModern_Pentathlon_df

Unnamed: 0,Age,Height,Weight,NOC,Modern Pentathlon,Women_Modern_Pentathlon_Prediction
51,1,4,6,ROU,False,0.000020
69,1,5,2,NOR,False,0.006860
81,2,4,2,EST,False,0.009718
82,2,4,2,EST,False,0.009718
101,1,3,1,AZE,False,0.004606
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.000131
204017,3,4,2,GRC,False,0.619385
204019,1,4,2,RUS,False,0.722500
204020,1,4,2,RUS,False,0.722500


### Women's Table Tennis Regression

In [604]:
# Preserve Table Tennis column
fTable_Tennis_list = female_df['Table Tennis'].tolist()

In [605]:
# Delete all sport columns
fTable_Tennis_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fTable_Tennis_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [606]:
# Add Table Tennis column back in
fTable_Tennis_df['Table Tennis'] = fTable_Tennis_list

fTable_Tennis_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Table Tennis
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [607]:
fTable_Tennis_dummies = pd.get_dummies(fTable_Tennis_df)
fTable_Tennis_dummies.head(3)

Unnamed: 0,Table Tennis,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [608]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fTable_Tennis_dummies.drop(columns=['Table Tennis'])
y_train = fTable_Tennis_dummies['Table Tennis']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fTable_Tennis_dummies.drop(columns=['Table Tennis'])
y_test = fTable_Tennis_dummies['Table Tennis']

In [609]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9832833261894557

In [610]:
fTable_Tennis_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fTable_Tennis_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fTable_Tennis_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fTable_Tennis_df)

array([0.03889941, 0.00183494, 0.02844038, ..., 0.40507508, 0.40507508,
       0.88315747])

Unnamed: 0,Age,Height,Weight,NOC,Table Tennis
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [611]:
fTable_Tennis_pred = y_pred.tolist()

In [612]:
fTable_Tennis_df['Women_Table_Tennis_Prediction'] = fTable_Tennis_pred

In [613]:
fTable_Tennis_df

Unnamed: 0,Age,Height,Weight,NOC,Table Tennis,Women_Table_Tennis_Prediction
51,1,4,6,ROU,False,0.038899
69,1,5,2,NOR,False,0.001835
81,2,4,2,EST,False,0.028440
82,2,4,2,EST,False,0.028440
101,1,3,1,AZE,False,0.024597
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.211905
204017,3,4,2,GRC,False,0.452018
204019,1,4,2,RUS,False,0.405075
204020,1,4,2,RUS,False,0.405075


### Women's Synchronized Swimming Regression

In [614]:
# Preserve Synchronized Swimming column
fSynchronized_Swimming_list = female_df['Synchronized Swimming'].tolist()

In [615]:
# Delete all sport columns
fSynchronized_Swimming_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fSynchronized_Swimming_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [616]:
# Add Synchronized Swimming column back in
fSynchronized_Swimming_df['Synchronized Swimming'] = fSynchronized_Swimming_list

fSynchronized_Swimming_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Synchronized Swimming
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [617]:
fSynchronized_Swimming_dummies = pd.get_dummies(fSynchronized_Swimming_df)
fSynchronized_Swimming_dummies.head(3)

Unnamed: 0,Synchronized Swimming,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [618]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fSynchronized_Swimming_dummies.drop(columns=['Synchronized Swimming'])
y_train = fSynchronized_Swimming_dummies['Synchronized Swimming']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fSynchronized_Swimming_dummies.drop(columns=['Synchronized Swimming'])
y_test = fSynchronized_Swimming_dummies['Synchronized Swimming']

In [619]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9835171258231695

In [620]:
fSynchronized_Swimming_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fSynchronized_Swimming_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fSynchronized_Swimming_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fSynchronized_Swimming_df)

array([6.08569225e-05, 5.83211036e-03, 8.40739437e-03, ...,
       8.36516550e-01, 8.36516550e-01, 1.18888138e-01])

Unnamed: 0,Age,Height,Weight,NOC,Synchronized Swimming
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [621]:
fSynchronized_Swimming_pred = y_pred.tolist()

In [622]:
fSynchronized_Swimming_df['Women_Synchronized_Swimming_Prediction'] = fSynchronized_Swimming_pred

In [623]:
fSynchronized_Swimming_df

Unnamed: 0,Age,Height,Weight,NOC,Synchronized Swimming,Women_Synchronized_Swimming_Prediction
51,1,4,6,ROU,False,0.000061
69,1,5,2,NOR,False,0.005832
81,2,4,2,EST,False,0.008407
82,2,4,2,EST,False,0.008407
101,1,3,1,AZE,False,0.019487
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.000026
204017,3,4,2,GRC,False,0.240000
204019,1,4,2,RUS,False,0.836517
204020,1,4,2,RUS,False,0.836517


### Women's Rhythmic Gymnastics Regression

In [624]:
# Preserve Rhythmic Gymnastics column
fRhythmic_Gymnastics_list = female_df['Rhythmic Gymnastics'].tolist()

In [625]:
# Delete all sport columns
fRhythmic_Gymnastics_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fRhythmic_Gymnastics_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [626]:
# Add Rhythmic Gymnastics column back in
fRhythmic_Gymnastics_df['Rhythmic Gymnastics'] = fRhythmic_Gymnastics_list

fRhythmic_Gymnastics_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Rhythmic Gymnastics
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [627]:
fRhythmic_Gymnastics_dummies = pd.get_dummies(fRhythmic_Gymnastics_df)
fRhythmic_Gymnastics_dummies.head(3)

Unnamed: 0,Rhythmic Gymnastics,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [628]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fRhythmic_Gymnastics_dummies.drop(columns=['Rhythmic Gymnastics'])
y_train = fRhythmic_Gymnastics_dummies['Rhythmic Gymnastics']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fRhythmic_Gymnastics_dummies.drop(columns=['Rhythmic Gymnastics'])
y_test = fRhythmic_Gymnastics_dummies['Rhythmic Gymnastics']

In [629]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9886996843704945

In [630]:
fRhythmic_Gymnastics_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fRhythmic_Gymnastics_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fRhythmic_Gymnastics_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fRhythmic_Gymnastics_df)

array([3.12701909e-02, 5.15577337e-02, 4.00059777e-02, ...,
       2.86323501e-01, 2.86323501e-01, 1.79255230e-04])

Unnamed: 0,Age,Height,Weight,NOC,Rhythmic Gymnastics
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [631]:
fRhythmic_Gymnastics_pred = y_pred.tolist()

In [632]:
fRhythmic_Gymnastics_df['Women_Rhythmic_Gymnastics_Prediction'] = fRhythmic_Gymnastics_pred

In [633]:
fRhythmic_Gymnastics_df

Unnamed: 0,Age,Height,Weight,NOC,Rhythmic Gymnastics,Women_Rhythmic_Gymnastics_Prediction
51,1,4,6,ROU,False,3.127019e-02
69,1,5,2,NOR,False,5.155773e-02
81,2,4,2,EST,False,4.000598e-02
82,2,4,2,EST,False,4.000598e-02
101,1,3,1,AZE,False,9.907307e-01
...,...,...,...,...,...,...
204000,3,4,3,URS,False,2.453199e-08
204017,3,4,2,GRC,False,1.395553e-04
204019,1,4,2,RUS,False,2.863235e-01
204020,1,4,2,RUS,False,2.863235e-01


### Women's Rugby Sevens Regression

In [634]:
# Preserve Rugby Sevens column
fRugby_Sevens_list = female_df['Rugby Sevens'].tolist()

In [635]:
# Delete all sport columns
fRugby_Sevens_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fRugby_Sevens_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [636]:
# Add Rugby Sevens column back in
fRugby_Sevens_df['Rugby Sevens'] = fRugby_Sevens_list

fRugby_Sevens_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Rugby Sevens
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [637]:
fRugby_Sevens_dummies = pd.get_dummies(fRugby_Sevens_df)
fRugby_Sevens_dummies.head(3)

Unnamed: 0,Rugby Sevens,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [638]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fRugby_Sevens_dummies.drop(columns=['Rugby Sevens'])
y_train = fRugby_Sevens_dummies['Rugby Sevens']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fRugby_Sevens_dummies.drop(columns=['Rugby Sevens'])
y_test = fRugby_Sevens_dummies['Rugby Sevens']

In [639]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9971359544870046

In [640]:
fRugby_Sevens_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fRugby_Sevens_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fRugby_Sevens_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fRugby_Sevens_df)

array([0.00014288, 0.00016886, 0.00177619, ..., 0.00083802, 0.00083802,
       0.00149531])

Unnamed: 0,Age,Height,Weight,NOC,Rugby Sevens
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [641]:
fRugby_Sevens_pred = y_pred.tolist()

In [642]:
fRugby_Sevens_df['Women_Rugby_Sevens_Prediction'] = fRugby_Sevens_pred

In [643]:
fRugby_Sevens_df

Unnamed: 0,Age,Height,Weight,NOC,Rugby Sevens,Women_Rugby_Sevens_Prediction
51,1,4,6,ROU,False,0.000143
69,1,5,2,NOR,False,0.000169
81,2,4,2,EST,False,0.001776
82,2,4,2,EST,False,0.001776
101,1,3,1,AZE,False,0.000089
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.005784
204017,3,4,2,GRC,False,0.001495
204019,1,4,2,RUS,False,0.000838
204020,1,4,2,RUS,False,0.000838


### Women's Trampolining Regression

In [644]:
# Preserve Trampolining column
fTrampolining_list = female_df['Trampolining'].tolist()

In [645]:
# Delete all sport columns
fTrampolining_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fTrampolining_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [646]:
# Add Trampolining column back in
fTrampolining_df['Trampolining'] = fTrampolining_list

fTrampolining_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Trampolining
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [647]:
fTrampolining_dummies = pd.get_dummies(fTrampolining_df)
fTrampolining_dummies.head(3)

Unnamed: 0,Trampolining,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [648]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fTrampolining_dummies.drop(columns=['Trampolining'])
y_train = fTrampolining_dummies['Trampolining']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fTrampolining_dummies.drop(columns=['Trampolining'])
y_test = fTrampolining_dummies['Trampolining']

In [649]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9985387522892881

In [650]:
fTrampolining_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fTrampolining_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fTrampolining_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fTrampolining_df)

array([1.23970426e-05, 1.30026487e-07, 2.13798907e-04, ...,
       5.58458898e-01, 5.58458898e-01, 6.37156643e-01])

Unnamed: 0,Age,Height,Weight,NOC,Trampolining
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [651]:
fTrampolining_pred = y_pred.tolist()

In [652]:
fTrampolining_df['Women_Trampolining_Prediction'] = fTrampolining_pred

In [653]:
fTrampolining_df

Unnamed: 0,Age,Height,Weight,NOC,Trampolining,Women_Trampolining_Prediction
51,1,4,6,ROU,False,1.239704e-05
69,1,5,2,NOR,False,1.300265e-07
81,2,4,2,EST,False,2.137989e-04
82,2,4,2,EST,False,2.137989e-04
101,1,3,1,AZE,False,5.433907e-03
...,...,...,...,...,...,...
204000,3,4,3,URS,False,1.043582e-06
204017,3,4,2,GRC,False,1.902248e-04
204019,1,4,2,RUS,False,5.584589e-01
204020,1,4,2,RUS,False,5.584589e-01


### Women's Beach Volleyball Regression

In [654]:
# Preserve Beach Volleyball column
fBeach_Volleyball_list = female_df['Beach Volleyball'].tolist()

In [655]:
# Delete all sport columns
fBeach_Volleyball_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fBeach_Volleyball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [656]:
# Add Beach Volleyball column back in
fBeach_Volleyball_df['Beach Volleyball'] = fBeach_Volleyball_list

fBeach_Volleyball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Beach Volleyball
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [657]:
fBeach_Volleyball_dummies = pd.get_dummies(fBeach_Volleyball_df)
fBeach_Volleyball_dummies.head(3)

Unnamed: 0,Beach Volleyball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [658]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fBeach_Volleyball_dummies.drop(columns=['Beach Volleyball'])
y_train = fBeach_Volleyball_dummies['Beach Volleyball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fBeach_Volleyball_dummies.drop(columns=['Beach Volleyball'])
y_test = fBeach_Volleyball_dummies['Beach Volleyball']

In [659]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9948369247554846

In [660]:
fBeach_Volleyball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fBeach_Volleyball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fBeach_Volleyball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fBeach_Volleyball_df)

array([5.94451189e-07, 8.33018352e-01, 2.98915794e-03, ...,
       6.95495119e-02, 6.95495119e-02, 4.53947508e-03])

Unnamed: 0,Age,Height,Weight,NOC,Beach Volleyball
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [661]:
fBeach_Volleyball_pred = y_pred.tolist()

In [662]:
fBeach_Volleyball_df['Women_Beach_Volleyball_Prediction'] = fBeach_Volleyball_pred

In [663]:
fBeach_Volleyball_df

Unnamed: 0,Age,Height,Weight,NOC,Beach Volleyball,Women_Beach_Volleyball_Prediction
51,1,4,6,ROU,False,5.944512e-07
69,1,5,2,NOR,False,8.330184e-01
81,2,4,2,EST,False,2.989158e-03
82,2,4,2,EST,False,2.989158e-03
101,1,3,1,AZE,False,3.698133e-05
...,...,...,...,...,...,...
204000,3,4,3,URS,False,2.512444e-03
204017,3,4,2,GRC,False,8.384850e-01
204019,1,4,2,RUS,False,6.954951e-02
204020,1,4,2,RUS,False,6.954951e-02


### Women's Triathlon Regression

In [664]:
# Preserve Triathlon column
fTriathlon_list = female_df['Triathlon'].tolist()

In [665]:
# Delete all sport columns
fTriathlon_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fTriathlon_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [666]:
# Add Triathlon column back in
fTriathlon_df['Triathlon'] = fTriathlon_list

fTriathlon_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Triathlon
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [667]:
fTriathlon_dummies = pd.get_dummies(fTriathlon_df)
fTriathlon_dummies.head(3)

Unnamed: 0,Triathlon,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [668]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fTriathlon_dummies.drop(columns=['Triathlon'])
y_train = fTriathlon_dummies['Triathlon']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fTriathlon_dummies.drop(columns=['Triathlon'])
y_test = fTriathlon_dummies['Triathlon']

In [669]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9949148579667225

In [670]:
fTriathlon_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fTriathlon_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fTriathlon_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fTriathlon_df)

array([8.14219881e-06, 1.19238660e-03, 9.15646398e-01, ...,
       2.42882767e-01, 2.42882767e-01, 8.01159704e-03])

Unnamed: 0,Age,Height,Weight,NOC,Triathlon
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [671]:
fTriathlon_pred = y_pred.tolist()

In [672]:
fTriathlon_df['Women_Triathlon_Prediction'] = fTriathlon_pred

In [673]:
fTriathlon_df

Unnamed: 0,Age,Height,Weight,NOC,Triathlon,Women_Triathlon_Prediction
51,1,4,6,ROU,False,0.000008
69,1,5,2,NOR,False,0.001192
81,2,4,2,EST,False,0.915646
82,2,4,2,EST,False,0.915646
101,1,3,1,AZE,False,0.005969
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.000004
204017,3,4,2,GRC,False,0.339672
204019,1,4,2,RUS,False,0.242883
204020,1,4,2,RUS,False,0.242883


### Women's Golf Regression

In [674]:
# Preserve Golf column
fGolf_list = female_df['Golf'].tolist()

In [675]:
# Delete all sport columns
fGolf_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fGolf_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [676]:
# Add Golf column back in
fGolf_df['Golf'] = fGolf_list

fGolf_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Golf
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [677]:
fGolf_dummies = pd.get_dummies(fGolf_df)
fGolf_dummies.head(3)

Unnamed: 0,Golf,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [678]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fGolf_dummies.drop(columns=['Golf'])
y_train = fGolf_dummies['Golf']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fGolf_dummies.drop(columns=['Golf'])
y_test = fGolf_dummies['Golf']

In [679]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.998928418345478

In [680]:
fGolf_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fGolf_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fGolf_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fGolf_df)

array([9.78299087e-05, 3.98097464e-01, 5.39261279e-03, ...,
       5.48823051e-01, 5.48823051e-01, 1.61068474e-02])

Unnamed: 0,Age,Height,Weight,NOC,Golf
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [681]:
fGolf_pred = y_pred.tolist()

In [682]:
fGolf_df['Women_Golf_Prediction'] = fGolf_pred

In [683]:
fGolf_df

Unnamed: 0,Age,Height,Weight,NOC,Golf,Women_Golf_Prediction
51,1,4,6,ROU,False,0.000098
69,1,5,2,NOR,False,0.398097
81,2,4,2,EST,False,0.005393
82,2,4,2,EST,False,0.005393
101,1,3,1,AZE,False,0.001723
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.013832
204017,3,4,2,GRC,False,0.012349
204019,1,4,2,RUS,False,0.548823
204020,1,4,2,RUS,False,0.548823


### User input Calc

In [None]:
# Turn user input into binned row
user_df["Age"] = pd.cut(male_df["Age"], mAge_bins, labels=mAge_bin_names)
user_df["Height"] = pd.cut(male_df["Height"], mHeight_bins, labels=mHeight_bin_names)
user_df["Weight"] = pd.cut(male_df["Weight"], mWeight_bins, labels=mWeight_bin_names)



In [685]:
mBasketball_dummies.columns

Index(['Basketball', 'Age_0', 'Age_1', 'Age_2', 'Age_3', 'Age_4', 'Age_5',
       'Age_6', 'Age_7', 'Age_8',
       ...
       'NOC_VGB', 'NOC_VIR', 'NOC_VNM', 'NOC_VUT', 'NOC_WSM', 'NOC_XKX',
       'NOC_YEM', 'NOC_ZAF', 'NOC_ZMB', 'NOC_ZWE'],
      dtype='object', length=240)