In [689]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
import pandas as pd
import tensorflow as tf
import numpy as np
import pickle


#  Import and read the data.csv.
import pandas as pd 
data_df = pd.read_csv("archive/data.csv")
#data_df.head()

In [690]:
#Drop unnecessary columns
data_df = data_df.drop(["Unnamed: 0","Unnamed: 0.1", "ID", "Name", "Team", "Games","Year","City", "Event", "Medal", "Host", "Nat_Pop", "Global_Pop%", "Rural_Pop%", "Life_Expect", "GDP", "GDP_Per_Cap", "Global_GDP%"], 1)

#Drop NaN values
data_df = data_df.dropna()

data_df.head(3)

Unnamed: 0,Sex,Age,Height,Weight,NOC,Season,Sport
0,M,24.0,180.0,80.0,CHN,Summer,Basketball
1,M,23.0,170.0,60.0,CHN,Summer,Judo
2,F,21.0,185.0,82.0,NLD,Winter,Speed Skating


In [691]:
#Filter out season data to just summer
data_df = data_df[data_df["Season"].str.contains("Winter") == False]

In [692]:
#Drop Season since no longer needed after filtering
data_df = data_df.drop(["Season"],1)
data_df.head()

Unnamed: 0,Sex,Age,Height,Weight,NOC,Sport
0,M,24.0,180.0,80.0,CHN,Basketball
1,M,23.0,170.0,60.0,CHN,Judo
26,M,31.0,172.0,70.0,FIN,Badminton
29,M,31.0,189.0,130.0,FIN,Athletics
51,F,22.0,170.0,125.0,ROU,Weightlifting


In [693]:
# Create list of sports
sports = data_df['Sport'].unique()
print(sports)

['Basketball' 'Judo' 'Badminton' 'Athletics' 'Weightlifting' 'Wrestling'
 'Rowing' 'Swimming' 'Football' 'Equestrianism' 'Shooting' 'Gymnastics'
 'Taekwondo' 'Boxing' 'Fencing' 'Diving' 'Canoeing' 'Handball'
 'Water Polo' 'Tennis' 'Cycling' 'Hockey' 'Softball' 'Archery'
 'Volleyball' 'Synchronized Swimming' 'Modern Pentathlon' 'Table Tennis'
 'Baseball' 'Rhythmic Gymnastics' 'Rugby Sevens' 'Trampolining'
 'Beach Volleyball' 'Triathlon' 'Golf']


In [694]:
# create list for each sport
sport_list = {sport:[] for sport in sports}
sport_list

{'Basketball': [],
 'Judo': [],
 'Badminton': [],
 'Athletics': [],
 'Weightlifting': [],
 'Wrestling': [],
 'Rowing': [],
 'Swimming': [],
 'Football': [],
 'Equestrianism': [],
 'Shooting': [],
 'Gymnastics': [],
 'Taekwondo': [],
 'Boxing': [],
 'Fencing': [],
 'Diving': [],
 'Canoeing': [],
 'Handball': [],
 'Water Polo': [],
 'Tennis': [],
 'Cycling': [],
 'Hockey': [],
 'Softball': [],
 'Archery': [],
 'Volleyball': [],
 'Synchronized Swimming': [],
 'Modern Pentathlon': [],
 'Table Tennis': [],
 'Baseball': [],
 'Rhythmic Gymnastics': [],
 'Rugby Sevens': [],
 'Trampolining': [],
 'Beach Volleyball': [],
 'Triathlon': [],
 'Golf': []}

In [695]:
# Populate sport lists
for sport in sports:
    for value in data_df['Sport']:
        if value == sport:
            sport_list[sport].append(True),
        else:
            sport_list[sport].append(False)


In [696]:
# check length of lists
len(sport_list['Judo'])

152696

In [697]:
# check length of df to make sure list has appended
len(data_df['Sport'])

152696

In [698]:
# Add column for each sport
for sport in sports:
    data_df[sport] = sport_list[sport]


data_df.head()

Unnamed: 0,Sex,Age,Height,Weight,NOC,Sport,Basketball,Judo,Badminton,Athletics,...,Synchronized Swimming,Modern Pentathlon,Table Tennis,Baseball,Rhythmic Gymnastics,Rugby Sevens,Trampolining,Beach Volleyball,Triathlon,Golf
0,M,24.0,180.0,80.0,CHN,Basketball,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,M,23.0,170.0,60.0,CHN,Judo,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
26,M,31.0,172.0,70.0,FIN,Badminton,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
29,M,31.0,189.0,130.0,FIN,Athletics,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False
51,F,22.0,170.0,125.0,ROU,Weightlifting,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


In [699]:
# Drop Sport Column
data_df = data_df.drop(['Sport'],1)

In [700]:
# Seperate mens and womens sports
male_df = data_df[data_df['Sex'] == "M"]
female_df = data_df[data_df['Sex'] == "F"]

In [701]:
# Drop Sex column
male_df = male_df.drop(["Sex"],1)
female_df = female_df.drop(["Sex"],1)

In [702]:
# Drop the single sex sports from respective dataframes
male_df = male_df.drop(["Softball", "Synchronized Swimming", "Rhythmic Gymnastics"],1)
female_df = female_df.drop(["Baseball"],1)

In [703]:
male_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Basketball,Judo,Badminton,Athletics,Weightlifting,Wrestling,...,Archery,Volleyball,Modern Pentathlon,Table Tennis,Baseball,Rugby Sevens,Trampolining,Beach Volleyball,Triathlon,Golf
0,24.0,180.0,80.0,CHN,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,23.0,170.0,60.0,CHN,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
26,31.0,172.0,70.0,FIN,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
29,31.0,189.0,130.0,FIN,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
52,22.0,187.0,89.0,NOR,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [704]:
#Begin binning data to make it easier to get dummies
#Find min and max of data types to create bins
male_df['Age'] = male_df['Age'].astype(float, errors = 'raise')

mMin_age = male_df['Age'].min()
mMax_age = male_df['Age'].max()
print(mMin_age)
print(mMax_age)

#Create bins with min and max
mAge_bins = np.linspace(mMin_age,mMax_age,10)
mAge_bin_names = ["0","1","2","3","4","5","6","7","8"]

# Add column
male_df["Age"] = pd.cut(male_df["Age"], mAge_bins, labels=mAge_bin_names)

12.0
71.0


In [705]:
#Find min and max of data types to create bins
male_df['Height'] = male_df['Height'].astype(float, errors = 'raise')

mMin_height = male_df['Height'].min()
mMax_height = male_df['Height'].max()
print(mMin_height)
print(mMax_height)

#Create bins with min and max
mHeight_bins = np.linspace(mMin_height,mMax_height,10)
mHeight_bin_names = ["0","1","2","3","4","5","6","7","8"]

# Add column
male_df["Height"] = pd.cut(male_df["Height"], mHeight_bins, labels=mHeight_bin_names)

127.0
226.0


In [706]:
#Find min and max of data types to create bins
male_df['Weight'] = male_df['Weight'].astype(float, errors = 'raise')

mMin_Weight = male_df['Weight'].min()
mMax_Weight = male_df['Weight'].max()
print(mMin_Weight)
print(mMax_Weight)

#Create bins with min and max
mWeight_bins = np.linspace(mMin_Weight,mMax_Weight,10)
mWeight_bin_names = ["0","1","2","3","4","5","6","7","8"]

# Add column
male_df["Weight"] = pd.cut(male_df["Weight"], mWeight_bins, labels=mWeight_bin_names)

37.0
214.0


In [707]:
male_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Basketball,Judo,Badminton,Athletics,Weightlifting,Wrestling,...,Archery,Volleyball,Modern Pentathlon,Table Tennis,Baseball,Rugby Sevens,Trampolining,Beach Volleyball,Triathlon,Golf
0,1,4,2,CHN,True,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1,3,1,CHN,False,True,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
26,2,4,1,FIN,False,False,True,False,False,False,...,False,False,False,False,False,False,False,False,False,False
29,2,5,4,FIN,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
52,1,5,2,NOR,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [708]:
#Find min and max of data types to create bins
female_df['Age'] = female_df['Age'].astype(float, errors = 'raise')

fMin_Age = female_df['Age'].min()
fMax_Age = female_df['Age'].max()
print(fMin_Age)
print(fMax_Age)

#Create bins with min and max
fAge_bins = np.linspace(fMin_Age,fMax_Age,10)
fAge_bin_names = ["0","1","2","3","4","5","6","7","8"]

# Add column
female_df["Age"] = pd.cut(female_df["Age"], fAge_bins, labels=fAge_bin_names)

11.0
69.0


In [709]:
#Find min and max of data types to create bins
female_df['Height'] = female_df['Height'].astype(float, errors = 'raise')

fMin_Height = female_df['Height'].min()
fMax_Height = female_df['Height'].max()
print(fMin_Height)
print(fMax_Height)

#Create bins with min and max
fHeight_bins = np.linspace(fMin_Height,fMax_Height,10)
fHeight_bin_names = ["0","1","2","3","4","5","6","7","8"]

# Add column
female_df["Height"] = pd.cut(female_df["Height"], fHeight_bins, labels=fHeight_bin_names)

127.0
213.0


In [710]:
#Find min and max of data types to create bins
female_df['Weight'] = female_df['Weight'].astype(float, errors = 'raise')

fMin_Weight = female_df['Weight'].min()
fMax_Weight = female_df['Weight'].max()
print(fMin_Weight)
print(fMax_Weight)

#Create bins with min and max
fWeight_bins = np.linspace(fMin_Weight,fMax_Weight,10)
fWeight_bin_names = ["0","1","2","3","4","5","6","7","8"]

# Add column
female_df["Weight"] = pd.cut(female_df["Weight"], fWeight_bins, labels=fWeight_bin_names)

25.0
167.0


In [711]:
female_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Basketball,Judo,Badminton,Athletics,Weightlifting,Wrestling,...,Volleyball,Synchronized Swimming,Modern Pentathlon,Table Tennis,Rhythmic Gymnastics,Rugby Sevens,Trampolining,Beach Volleyball,Triathlon,Golf
51,1,4,6,ROU,False,False,False,False,True,False,...,False,False,False,False,False,False,False,False,False,False
69,1,5,2,NOR,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False
81,2,4,2,EST,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
82,2,4,2,EST,False,False,False,True,False,False,...,False,False,False,False,False,False,False,False,False,False
101,1,3,1,AZE,False,False,False,False,False,False,...,False,False,False,False,False,False,False,False,False,False


### Men's Basketball Regression

In [712]:
# Preserve Basketball column
mBasketball_list = male_df['Basketball'].tolist()

In [713]:
# Delete all sport columns
mBasketball_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mBasketball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [714]:
# Add basketball column back in
mBasketball_df['Basketball'] = mBasketball_list

mBasketball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Basketball
0,1,4,2,CHN,True
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [715]:
mBasketball_dummies = pd.get_dummies(mBasketball_df)
mBasketball_dummies.head(3)

Unnamed: 0,Basketball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,True,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [716]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mBasketball_dummies.drop(columns=['Basketball'])
y_train = mBasketball_dummies['Basketball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mBasketball_dummies.drop(columns=['Basketball'])
y_test = mBasketball_dummies['Basketball']

In [717]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9810693499062839

In [718]:
mBasketball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mBasketball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mBasketball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mBasketball_df)

array([0.33200395, 0.0306448 , 0.10718443, ..., 0.09492848, 0.20142717,
       0.25164207])

Unnamed: 0,Age,Height,Weight,NOC,Basketball
0,1,4,2,CHN,True
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [719]:
mBasketball_pred = y_pred.tolist()

In [720]:
mBasketball_df['Men_Basketball_Prediction'] = mBasketball_pred

In [721]:
mBasketball_df

Unnamed: 0,Age,Height,Weight,NOC,Basketball,Men_Basketball_Prediction
0,1,4,2,CHN,True,0.332004
1,1,3,1,CHN,False,0.030645
26,2,4,1,FIN,False,0.107184
29,2,5,4,FIN,False,0.107662
52,1,5,2,NOR,False,0.002617
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.024003
204023,2,3,1,ARG,False,0.031745
204024,1,4,1,USA,False,0.094928
204025,1,5,1,RUS,False,0.201427


### Men's Judo Regression

In [34]:
# Preserve Judo column
mJudo_list = male_df['Judo'].tolist()

In [35]:
# Delete all sport columns
mJudo_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mJudo_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [36]:
# Add Judo column back in
mJudo_df['Judo'] = mJudo_list

mJudo_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Judo
0,1,4,2,CHN,False
1,1,3,1,CHN,True
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [37]:
mJudo_dummies = pd.get_dummies(mJudo_df)
mJudo_dummies.head(3)

Unnamed: 0,Judo,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,True,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [38]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mJudo_dummies.drop(columns=['Judo'])
y_train = mJudo_dummies['Judo']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mJudo_dummies.drop(columns=['Judo'])
y_test = mJudo_dummies['Judo']

In [39]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9764525993883792

In [40]:
mJudo_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mJudo_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mJudo_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mJudo_df)

array([0.74966055, 0.64094174, 0.28978233, ..., 0.24287128, 0.11122837,
       0.13192836])

Unnamed: 0,Age,Height,Weight,NOC,Judo
0,1,4,2,CHN,False
1,1,3,1,CHN,True
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [41]:
mJudo_pred = y_pred.tolist()

In [42]:
mJudo_df['Men_Judo_Prediction'] = mJudo_pred

In [43]:
mJudo_df

Unnamed: 0,Age,Height,Weight,NOC,Judo,Men_Judo_Prediction
0,1,4,2,CHN,False,0.749661
1,1,3,1,CHN,True,0.640942
26,2,4,1,FIN,False,0.289782
29,2,5,4,FIN,False,0.881661
52,1,5,2,NOR,False,0.152280
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.610814
204023,2,3,1,ARG,False,0.655877
204024,1,4,1,USA,False,0.242871
204025,1,5,1,RUS,False,0.111228


### Men's Badminton Regression

In [44]:
# Preserve Badminton column
mBadminton_list = male_df['Badminton'].tolist()

In [45]:
# Delete all sport columns
mBadminton_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mBadminton_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [46]:
# Add badminton column back in
mBadminton_df['Badminton'] = mBadminton_list

mBadminton_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Badminton
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,True
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [47]:
mBadminton_dummies = pd.get_dummies(mBadminton_df)
mBadminton_dummies.head(3)

Unnamed: 0,Badminton,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,True,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [48]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mBadminton_dummies.drop(columns=['Badminton'])
y_train = mBadminton_dummies['Badminton']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mBadminton_dummies.drop(columns=['Badminton'])
y_test = mBadminton_dummies['Badminton']

In [49]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9930650093716089

In [50]:
mBadminton_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mBadminton_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mBadminton_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mBadminton_df)

array([0.8768612 , 0.69172194, 0.68833056, ..., 0.36436634, 0.46870193,
       0.64304272])

Unnamed: 0,Age,Height,Weight,NOC,Badminton
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,True
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [51]:
mBadminton_pred = y_pred.tolist()

In [52]:
mBadminton_df['Men_Badminton_Prediction'] = mBadminton_pred

In [53]:
mBadminton_df

Unnamed: 0,Age,Height,Weight,NOC,Badminton,Men_Badminton_Prediction
0,1,4,2,CHN,False,0.876861
1,1,3,1,CHN,False,0.691722
26,2,4,1,FIN,True,0.688331
29,2,5,4,FIN,False,0.003795
52,1,5,2,NOR,False,0.236207
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.000595
204023,2,3,1,ARG,False,0.001214
204024,1,4,1,USA,False,0.364366
204025,1,5,1,RUS,False,0.468702


### Men's Athletics Regression

In [54]:
# Preserve Athletics column
mAthletics_list = male_df['Athletics'].tolist()

In [55]:
# Delete all sport columns
mAthletics_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mAthletics_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [56]:
# Add Athletics column back in
mAthletics_df['Athletics'] = mAthletics_list

mAthletics_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Athletics
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,True
52,1,5,2,NOR,False


In [57]:
mAthletics_dummies = pd.get_dummies(mAthletics_df)
mAthletics_dummies.head(3)

Unnamed: 0,Athletics,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [58]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mAthletics_dummies.drop(columns=['Athletics'])
y_train = mAthletics_dummies['Athletics']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mAthletics_dummies.drop(columns=['Athletics'])
y_test = mAthletics_dummies['Athletics']

In [59]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.849422906185262

In [60]:
mAthletics_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mAthletics_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mAthletics_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mAthletics_df)

array([0.17961898, 0.19803836, 0.70624121, ..., 0.60349916, 0.67394202,
       0.73498303])

Unnamed: 0,Age,Height,Weight,NOC,Athletics
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,True
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [61]:
mAthletics_pred = y_pred.tolist()

In [62]:
mAthletics_df['Men_Athletics_Prediction'] = mAthletics_pred

In [63]:
mAthletics_df

Unnamed: 0,Age,Height,Weight,NOC,Athletics,Men_Athletics_Prediction
0,1,4,2,CHN,False,0.179619
1,1,3,1,CHN,False,0.198038
26,2,4,1,FIN,False,0.706241
29,2,5,4,FIN,True,0.925981
52,1,5,2,NOR,False,0.388592
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.109631
204023,2,3,1,ARG,False,0.141786
204024,1,4,1,USA,False,0.603499
204025,1,5,1,RUS,False,0.673942


### Men's Weightlifting Regression

In [64]:
# Preserve Weightlifting column
mWeightlifting_list = male_df['Weightlifting'].tolist()

In [65]:
# Delete all sport columns
mWeightlifting_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mWeightlifting_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [66]:
# Add Weightlifting column back in
mWeightlifting_df['Weightlifting'] = mWeightlifting_list

mWeightlifting_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Weightlifting
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [67]:
mWeightlifting_dummies = pd.get_dummies(mWeightlifting_df)
mWeightlifting_dummies.head(3)

Unnamed: 0,Weightlifting,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [68]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mWeightlifting_dummies.drop(columns=['Weightlifting'])
y_train = mWeightlifting_dummies['Weightlifting']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mWeightlifting_dummies.drop(columns=['Weightlifting'])
y_test = mWeightlifting_dummies['Weightlifting']

In [69]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9770444904804183

In [70]:
mWeightlifting_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mWeightlifting_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mWeightlifting_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mWeightlifting_df)

array([0.75148528, 0.69967567, 0.13095497, ..., 0.06393424, 0.00129261,
       0.00152277])

Unnamed: 0,Age,Height,Weight,NOC,Weightlifting
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [71]:
mWeightlifting_pred = y_pred.tolist()

In [72]:
mWeightlifting_df['Men_Weightlifting_Prediction'] = mWeightlifting_pred

In [73]:
mWeightlifting_df

Unnamed: 0,Age,Height,Weight,NOC,Weightlifting,Men_Weightlifting_Prediction
0,1,4,2,CHN,False,0.751485
1,1,3,1,CHN,False,0.699676
26,2,4,1,FIN,False,0.130955
29,2,5,4,FIN,False,0.851163
52,1,5,2,NOR,False,0.022350
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.148528
204023,2,3,1,ARG,False,0.170499
204024,1,4,1,USA,False,0.063934
204025,1,5,1,RUS,False,0.001293


### Men's Wrestling Regression

In [74]:
# Preserve Wrestling column
mWrestling_list = male_df['Wrestling'].tolist()

In [75]:
# Delete all sport columns
mWrestling_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mWrestling_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [76]:
# Add Wrestling column back in
mWrestling_df['Wrestling'] = mWrestling_list

mWrestling_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Wrestling
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,True


In [77]:
mWrestling_dummies = pd.get_dummies(mWrestling_df)
mWrestling_dummies.head(3)

Unnamed: 0,Wrestling,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [78]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mWrestling_dummies.drop(columns=['Wrestling'])
y_train = mWrestling_dummies['Wrestling']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mWrestling_dummies.drop(columns=['Wrestling'])
y_test = mWrestling_dummies['Wrestling']

In [79]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9538818190786229

In [80]:
mWrestling_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mWrestling_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mWrestling_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mWrestling_df)

array([0.41814576, 0.44743345, 0.56839697, ..., 0.33946379, 0.09671455,
       0.12797096])

Unnamed: 0,Age,Height,Weight,NOC,Wrestling
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,True
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [81]:
mWrestling_pred = y_pred.tolist()

In [82]:
mWrestling_df['Men_Wrestling_Prediction'] = mWrestling_pred

In [83]:
mWrestling_df

Unnamed: 0,Age,Height,Weight,NOC,Wrestling,Men_Wrestling_Prediction
0,1,4,2,CHN,False,0.418146
1,1,3,1,CHN,False,0.447433
26,2,4,1,FIN,False,0.568397
29,2,5,4,FIN,False,0.969242
52,1,5,2,NOR,True,0.248520
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.508650
204023,2,3,1,ARG,False,0.586583
204024,1,4,1,USA,False,0.339464
204025,1,5,1,RUS,False,0.096715


### Men's Rowing Regression

In [84]:
# Preserve Rowing column
mRowing_list = male_df['Rowing'].tolist()

In [85]:
# Delete all sport columns
mRowing_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mRowing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [86]:
# Add Rowing column back in
mRowing_df['Rowing'] = mRowing_list

mRowing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Rowing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [87]:
mRowing_dummies = pd.get_dummies(mRowing_df)
mRowing_dummies.head(3)

Unnamed: 0,Rowing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [88]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mRowing_dummies.drop(columns=['Rowing'])
y_train = mRowing_dummies['Rowing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mRowing_dummies.drop(columns=['Rowing'])
y_test = mRowing_dummies['Rowing']

In [89]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9467495314195521

In [90]:
mRowing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mRowing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mRowing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mRowing_df)

array([0.2187973 , 0.02463639, 0.18401475, ..., 0.18809134, 0.33906142,
       0.37371803])

Unnamed: 0,Age,Height,Weight,NOC,Rowing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,True


In [91]:
mRowing_pred = y_pred.tolist()

In [92]:
mRowing_df['Men_Rowing_Prediction'] = mRowing_pred

In [93]:
mRowing_df

Unnamed: 0,Age,Height,Weight,NOC,Rowing,Men_Rowing_Prediction
0,1,4,2,CHN,False,0.218797
1,1,3,1,CHN,False,0.024636
26,2,4,1,FIN,False,0.184015
29,2,5,4,FIN,False,0.001198
52,1,5,2,NOR,False,0.833401
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.062675
204023,2,3,1,ARG,False,0.072166
204024,1,4,1,USA,False,0.188091
204025,1,5,1,RUS,True,0.339061


### Men's Swimming Regression

In [94]:
# Preserve Swimming column
mSwimming_list = male_df['Swimming'].tolist()

In [95]:
# Delete all sport columns
mSwimming_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mSwimming_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [96]:
# Add Swimming column back in
mSwimming_df['Swimming'] = mSwimming_list

mSwimming_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Swimming
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [97]:
mSwimming_dummies = pd.get_dummies(mSwimming_df)
mSwimming_dummies.head(3)

Unnamed: 0,Swimming,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [98]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mSwimming_dummies.drop(columns=['Swimming'])
y_train = mSwimming_dummies['Swimming']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mSwimming_dummies.drop(columns=['Swimming'])
y_test = mSwimming_dummies['Swimming']

In [99]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9097267436125086

In [100]:
mSwimming_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mSwimming_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mSwimming_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mSwimming_df)

array([0.51271598, 0.15627438, 0.1556563 , ..., 0.52029449, 0.80761216,
       0.47921675])

Unnamed: 0,Age,Height,Weight,NOC,Swimming
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [101]:
mSwimming_pred = y_pred.tolist()

In [102]:
mSwimming_df['Men_Swimming_Prediction'] = mSwimming_pred

In [103]:
mSwimming_df

Unnamed: 0,Age,Height,Weight,NOC,Swimming,Men_Swimming_Prediction
0,1,4,2,CHN,False,0.512716
1,1,3,1,CHN,False,0.156274
26,2,4,1,FIN,False,0.155656
29,2,5,4,FIN,False,0.000438
52,1,5,2,NOR,False,0.584802
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.114570
204023,2,3,1,ARG,False,0.027582
204024,1,4,1,USA,False,0.520294
204025,1,5,1,RUS,False,0.807612


### Men's Football Regression

In [722]:
# Preserve Football column
mFootball_list = male_df['Football'].tolist()

In [723]:
# Delete all sport columns
mFootball_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mFootball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [724]:
# Add Football column back in
mFootball_df['Football'] = mFootball_list

mFootball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Football
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [725]:
mFootball_dummies = pd.get_dummies(mFootball_df)
mFootball_dummies.head(3)

Unnamed: 0,Football,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [726]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mFootball_dummies.drop(columns=['Football'])
y_train = mFootball_dummies['Football']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mFootball_dummies.drop(columns=['Football'])
y_test = mFootball_dummies['Football']

In [727]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9686495018249975

In [728]:
mFootball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mFootball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mFootball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mFootball_df)

array([0.54536879, 0.42457214, 0.28703812, ..., 0.62579532, 0.00221357,
       0.00088739])

Unnamed: 0,Age,Height,Weight,NOC,Football
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,True
204025,1,5,1,RUS,False


In [729]:
mFootball_pred = y_pred.tolist()

In [730]:
mFootball_df['Men_Football_Prediction'] = mFootball_pred

In [731]:
mFootball_df

Unnamed: 0,Age,Height,Weight,NOC,Football,Men_Football_Prediction
0,1,4,2,CHN,False,0.545369
1,1,3,1,CHN,False,0.424572
26,2,4,1,FIN,False,0.287038
29,2,5,4,FIN,False,0.000570
52,1,5,2,NOR,False,0.453188
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.753329
204023,2,3,1,ARG,False,0.550092
204024,1,4,1,USA,True,0.625795
204025,1,5,1,RUS,False,0.002214


### Men's Equestrianism Regression

In [114]:
# Preserve Equestrianism column
mEquestrianism_list = male_df['Equestrianism'].tolist()

In [115]:
# Delete all sport columns
mEquestrianism_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mEquestrianism_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [116]:
# Add Equestrianism column back in
mEquestrianism_df['Equestrianism'] = mEquestrianism_list

mEquestrianism_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Equestrianism
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [117]:
mEquestrianism_dummies = pd.get_dummies(mEquestrianism_df)
mEquestrianism_dummies.head(3)

Unnamed: 0,Equestrianism,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [118]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mEquestrianism_dummies.drop(columns=['Equestrianism'])
y_train = mEquestrianism_dummies['Equestrianism']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mEquestrianism_dummies.drop(columns=['Equestrianism'])
y_test = mEquestrianism_dummies['Equestrianism']

In [119]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9712439577784354

In [120]:
mEquestrianism_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mEquestrianism_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mEquestrianism_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mEquestrianism_df)

array([0.01842739, 0.04481537, 0.02762986, ..., 0.35229395, 0.05010102,
       0.13722004])

Unnamed: 0,Age,Height,Weight,NOC,Equestrianism
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [121]:
mEquestrianism_pred = y_pred.tolist()

In [122]:
mEquestrianism_df['Men_Equestrianism_Prediction'] = mEquestrianism_pred

In [123]:
mEquestrianism_df

Unnamed: 0,Age,Height,Weight,NOC,Equestrianism,Men_Equestrianism_Prediction
0,1,4,2,CHN,False,0.018427
1,1,3,1,CHN,False,0.044815
26,2,4,1,FIN,False,0.027630
29,2,5,4,FIN,False,0.000017
52,1,5,2,NOR,False,0.036792
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.439509
204023,2,3,1,ARG,False,0.702783
204024,1,4,1,USA,False,0.352294
204025,1,5,1,RUS,False,0.050101


### Men's Shooting Regression

In [124]:
# Preserve Shooting column
mShooting_list = male_df['Shooting'].tolist()

In [125]:
# Delete all sport columns
mShooting_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mShooting_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [126]:
# Add Shooting column back in
mShooting_df['Shooting'] = mShooting_list

mShooting_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Shooting
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [127]:
mShooting_dummies = pd.get_dummies(mShooting_df)
mShooting_dummies.head(3)

Unnamed: 0,Shooting,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [128]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mShooting_dummies.drop(columns=['Shooting'])
y_train = mShooting_dummies['Shooting']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mShooting_dummies.drop(columns=['Shooting'])
y_test = mShooting_dummies['Shooting']

In [129]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9528756042221564

In [130]:
mShooting_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mShooting_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mShooting_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mShooting_df)

array([0.64514323, 0.42127693, 0.61140092, ..., 0.27079761, 0.09893163,
       0.21906815])

Unnamed: 0,Age,Height,Weight,NOC,Shooting
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [131]:
mShooting_pred = y_pred.tolist()

In [132]:
mShooting_df['Men_Shooting_Prediction'] = mShooting_pred

In [133]:
mShooting_df

Unnamed: 0,Age,Height,Weight,NOC,Shooting,Men_Shooting_Prediction
0,1,4,2,CHN,False,0.645143
1,1,3,1,CHN,False,0.421277
26,2,4,1,FIN,False,0.611401
29,2,5,4,FIN,False,0.454379
52,1,5,2,NOR,False,0.454513
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.142826
204023,2,3,1,ARG,False,0.298601
204024,1,4,1,USA,False,0.270798
204025,1,5,1,RUS,False,0.098932


### Men's Gymnastics Regression

In [134]:
# Preserve Gymnastics column
mGymnastics_list = male_df['Gymnastics'].tolist()

In [135]:
# Delete all sport columns
mGymnastics_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mGymnastics_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [136]:
# Add Gymnastics column back in
mGymnastics_df['Gymnastics'] = mGymnastics_list

mGymnastics_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Gymnastics
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [137]:
mGymnastics_dummies = pd.get_dummies(mGymnastics_df)
mGymnastics_dummies.head(3)

Unnamed: 0,Gymnastics,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [138]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mGymnastics_dummies.drop(columns=['Gymnastics'])
y_train = mGymnastics_dummies['Gymnastics']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mGymnastics_dummies.drop(columns=['Gymnastics'])
y_test = mGymnastics_dummies['Gymnastics']

In [139]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9194534872250173

In [140]:
mGymnastics_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mGymnastics_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mGymnastics_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mGymnastics_df)

array([0.07111817, 0.90001793, 0.6215455 , ..., 0.61232522, 0.13014905,
       0.08022635])

Unnamed: 0,Age,Height,Weight,NOC,Gymnastics
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [141]:
mGymnastics_pred = y_pred.tolist()

In [142]:
mGymnastics_df['Men_Gymnastics_Prediction'] = mGymnastics_pred

In [143]:
mGymnastics_df

Unnamed: 0,Age,Height,Weight,NOC,Gymnastics,Men_Gymnastics_Prediction
0,1,4,2,CHN,False,0.071118
1,1,3,1,CHN,False,0.900018
26,2,4,1,FIN,False,0.621546
29,2,5,4,FIN,False,0.001905
52,1,5,2,NOR,False,0.012869
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.497501
204023,2,3,1,ARG,False,0.365950
204024,1,4,1,USA,False,0.612325
204025,1,5,1,RUS,False,0.130149


### Men's Taekwondo Regression

In [144]:
# Preserve Taekwondo column
mTaekwondo_list = male_df['Taekwondo'].tolist()

In [145]:
# Delete all sport columns
mTaekwondo_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mTaekwondo_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [146]:
# Add Taekwondo column back in
mTaekwondo_df['Taekwondo'] = mTaekwondo_list

mTaekwondo_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Taekwondo
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [147]:
mTaekwondo_dummies = pd.get_dummies(mTaekwondo_df)
mTaekwondo_dummies.head(3)

Unnamed: 0,Taekwondo,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [148]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mTaekwondo_dummies.drop(columns=['Taekwondo'])
y_train = mTaekwondo_dummies['Taekwondo']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mTaekwondo_dummies.drop(columns=['Taekwondo'])
y_test = mTaekwondo_dummies['Taekwondo']

In [149]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9969813554306007

In [150]:
mTaekwondo_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mTaekwondo_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mTaekwondo_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mTaekwondo_df)

array([0.58502524, 0.47837345, 0.23021463, ..., 0.32079088, 0.65622917,
       0.70788037])

Unnamed: 0,Age,Height,Weight,NOC,Taekwondo
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [151]:
mTaekwondo_pred = y_pred.tolist()

In [152]:
mTaekwondo_df['Men_Taekwondo_Prediction'] = mTaekwondo_pred

In [153]:
mTaekwondo_df

Unnamed: 0,Age,Height,Weight,NOC,Taekwondo,Men_Taekwondo_Prediction
0,1,4,2,CHN,False,0.585025
1,1,3,1,CHN,False,0.478373
26,2,4,1,FIN,False,0.230215
29,2,5,4,FIN,False,0.000463
52,1,5,2,NOR,False,0.003245
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.321673
204023,2,3,1,ARG,False,0.375776
204024,1,4,1,USA,False,0.320791
204025,1,5,1,RUS,False,0.656229


### Men's Boxing Regression

In [752]:
# Preserve Boxing column
mBoxing_list = male_df['Boxing'].tolist()

In [753]:
# Delete all sport columns
mBoxing_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mBoxing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [754]:
# Add Boxing column back in
mBoxing_df['Boxing'] = mBoxing_list

mBoxing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Boxing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [755]:
mBoxing_dummies = pd.get_dummies(mBoxing_df)
mBoxing_dummies.head(3)

Unnamed: 0,Boxing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [756]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mBoxing_dummies.drop(columns=['Boxing'])
y_train = mBoxing_dummies['Boxing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mBoxing_dummies.drop(columns=['Boxing'])
y_test = mBoxing_dummies['Boxing']

In [757]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9596922166321397

In [758]:
mBoxing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mBoxing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mBoxing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mBoxing_df)

array([0.26304039, 0.38401335, 0.40502276, ..., 0.52739876, 0.49817983,
       0.34306106])

Unnamed: 0,Age,Height,Weight,NOC,Boxing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [759]:
mBoxing_pred = y_pred.tolist()

In [760]:
mBoxing_df['Men_Boxing_Prediction'] = mBoxing_pred

In [761]:
mBoxing_df

Unnamed: 0,Age,Height,Weight,NOC,Boxing,Men_Boxing_Prediction
0,1,4,2,CHN,False,0.263040
1,1,3,1,CHN,False,0.384013
26,2,4,1,FIN,False,0.405023
29,2,5,4,FIN,False,0.056807
52,1,5,2,NOR,False,0.237948
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.640188
204023,2,3,1,ARG,False,0.483451
204024,1,4,1,USA,False,0.527399
204025,1,5,1,RUS,False,0.498180


### Men's Fencing Regression

In [164]:
# Preserve Fencing column
mFencing_list = male_df['Fencing'].tolist()

In [165]:
# Delete all sport columns
mFencing_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mFencing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [166]:
# Add Fencing column back in
mFencing_df['Fencing'] = mFencing_list

mFencing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Fencing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [167]:
mFencing_dummies = pd.get_dummies(mFencing_df)
mFencing_dummies.head(3)

Unnamed: 0,Fencing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [168]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mFencing_dummies.drop(columns=['Fencing'])
y_train = mFencing_dummies['Fencing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mFencing_dummies.drop(columns=['Fencing'])
y_test = mFencing_dummies['Fencing']

In [169]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9571372200848377

In [170]:
mFencing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mFencing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mFencing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mFencing_df)

array([0.59808772, 0.36640095, 0.40014436, ..., 0.53258963, 0.59108885,
       0.70441163])

Unnamed: 0,Age,Height,Weight,NOC,Fencing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [171]:
mFencing_pred = y_pred.tolist()

In [172]:
mFencing_df['Men_Fencing_Prediction'] = mFencing_pred

In [173]:
mFencing_df

Unnamed: 0,Age,Height,Weight,NOC,Fencing,Men_Fencing_Prediction
0,1,4,2,CHN,False,0.598088
1,1,3,1,CHN,False,0.366401
26,2,4,1,FIN,False,0.400144
29,2,5,4,FIN,False,0.000826
52,1,5,2,NOR,False,0.466021
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.384015
204023,2,3,1,ARG,False,0.506846
204024,1,4,1,USA,False,0.532590
204025,1,5,1,RUS,False,0.591089


### Men's Diving Regression

In [174]:
# Preserve Diving column
mDiving_list = male_df['Diving'].tolist()

In [175]:
# Delete all sport columns
mDiving_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mDiving_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [176]:
# Add Diving column back in
mDiving_df['Diving'] = mDiving_list

mDiving_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Diving
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [177]:
mDiving_dummies = pd.get_dummies(mDiving_df)
mDiving_dummies.head(3)

Unnamed: 0,Diving,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [178]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mDiving_dummies.drop(columns=['Diving'])
y_train = mDiving_dummies['Diving']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mDiving_dummies.drop(columns=['Diving'])
y_test = mDiving_dummies['Diving']

In [179]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9899575811384038

In [180]:
mDiving_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mDiving_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mDiving_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mDiving_df)

array([0.47884487, 0.82131476, 0.56793541, ..., 0.83731649, 0.44058885,
       0.34360591])

Unnamed: 0,Age,Height,Weight,NOC,Diving
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [181]:
mDiving_pred = y_pred.tolist()

In [182]:
mDiving_df['Men_Diving_Prediction'] = mDiving_pred

In [183]:
mDiving_df

Unnamed: 0,Age,Height,Weight,NOC,Diving,Men_Diving_Prediction
0,1,4,2,CHN,False,0.478845
1,1,3,1,CHN,False,0.821315
26,2,4,1,FIN,False,0.567935
29,2,5,4,FIN,False,0.001274
52,1,5,2,NOR,False,0.119798
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.004641
204023,2,3,1,ARG,False,0.003090
204024,1,4,1,USA,False,0.837316
204025,1,5,1,RUS,False,0.440589


### Men's Canoeing Regression

In [762]:
# Preserve Canoeing column
mCanoeing_list = male_df['Canoeing'].tolist()

In [763]:
# Delete all sport columns
mCanoeing_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mCanoeing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [764]:
# Add Canoeing column back in
mCanoeing_df['Canoeing'] = mCanoeing_list

mCanoeing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Canoeing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [765]:
mCanoeing_dummies = pd.get_dummies(mCanoeing_df)
mCanoeing_dummies.head(3)

Unnamed: 0,Canoeing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [766]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mCanoeing_dummies.drop(columns=['Canoeing'])
y_train = mCanoeing_dummies['Canoeing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mCanoeing_dummies.drop(columns=['Canoeing'])
y_test = mCanoeing_dummies['Canoeing']

In [767]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9589523527670909

In [768]:
mCanoeing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mCanoeing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mCanoeing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mCanoeing_df)

array([0.66745913, 0.23773812, 0.53698938, ..., 0.42916984, 0.39895602,
       0.40517665])

Unnamed: 0,Age,Height,Weight,NOC,Canoeing
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [769]:
mCanoeing_pred = y_pred.tolist()

In [770]:
mCanoeing_df['Men_Canoeing_Prediction'] = mCanoeing_pred

In [771]:
mCanoeing_df

Unnamed: 0,Age,Height,Weight,NOC,Canoeing,Men_Canoeing_Prediction
0,1,4,2,CHN,False,0.667459
1,1,3,1,CHN,False,0.237738
26,2,4,1,FIN,False,0.536989
29,2,5,4,FIN,False,0.002569
52,1,5,2,NOR,False,0.792682
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.204546
204023,2,3,1,ARG,False,0.208788
204024,1,4,1,USA,False,0.429170
204025,1,5,1,RUS,False,0.398956


### Men's Handball Regression

In [228]:
# Preserve Handball column
mHandball_list = male_df['Handball'].tolist()

In [229]:
# Delete all sport columns
mHandball_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mHandball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [230]:
# Add Handball column back in
mHandball_df['Handball'] = mHandball_list

mHandball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Handball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [231]:
mHandball_dummies = pd.get_dummies(mHandball_df)
mHandball_dummies.head(3)

Unnamed: 0,Handball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [232]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mHandball_dummies.drop(columns=['Handball'])
y_train = mHandball_dummies['Handball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mHandball_dummies.drop(columns=['Handball'])
y_test = mHandball_dummies['Handball']

In [233]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9800434053467495

In [234]:
mHandball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mHandball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mHandball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mHandball_df)

array([0.25973662, 0.0068595 , 0.00041036, ..., 0.06144578, 0.20495602,
       0.3207454 ])

Unnamed: 0,Age,Height,Weight,NOC,Handball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [235]:
mHandball_pred = y_pred.tolist()

In [236]:
mHandball_df['Men_Handball_Prediction'] = mHandball_pred

In [237]:
mHandball_df

Unnamed: 0,Age,Height,Weight,NOC,Handball,Men_Handball_Prediction
0,1,4,2,CHN,False,0.259737
1,1,3,1,CHN,False,0.006860
26,2,4,1,FIN,False,0.000410
29,2,5,4,FIN,False,0.000826
52,1,5,2,NOR,False,0.658584
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.014632
204023,2,3,1,ARG,False,0.026480
204024,1,4,1,USA,False,0.061446
204025,1,5,1,RUS,False,0.204956


### Men's Water Polo Regression

In [238]:
# Preserve Water Polo column
mWater_Polo_list = male_df['Water Polo'].tolist()

In [239]:
# Delete all sport columns
mWater_Polo_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mWater_Polo_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [240]:
# Add Water Polo column back in
mWater_Polo_df['Water Polo'] = mWater_Polo_list

mWater_Polo_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Water Polo
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [241]:
mWater_Polo_dummies = pd.get_dummies(mWater_Polo_df)
mWater_Polo_dummies.head(3)

Unnamed: 0,Water Polo,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [242]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mWater_Polo_dummies.drop(columns=['Water Polo'])
y_train = mWater_Polo_dummies['Water Polo']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mWater_Polo_dummies.drop(columns=['Water Polo'])
y_test = mWater_Polo_dummies['Water Polo']

In [243]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9787905692019335

In [244]:
mWater_Polo_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mWater_Polo_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mWater_Polo_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mWater_Polo_df)

array([0.61220413, 0.04579586, 0.00069347, ..., 0.29635022, 0.2517127 ,
       0.28073352])

Unnamed: 0,Age,Height,Weight,NOC,Water Polo
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [245]:
mWater_Polo_pred = y_pred.tolist()

In [246]:
mWater_Polo_df['Men_Water_Polo_Prediction'] = mWater_Polo_pred

In [247]:
mWater_Polo_df

Unnamed: 0,Age,Height,Weight,NOC,Water Polo,Men_Water_Polo_Prediction
0,1,4,2,CHN,False,0.612204
1,1,3,1,CHN,False,0.045796
26,2,4,1,FIN,False,0.000693
29,2,5,4,FIN,False,0.000592
52,1,5,2,NOR,False,0.004496
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.010380
204023,2,3,1,ARG,False,0.012024
204024,1,4,1,USA,False,0.296350
204025,1,5,1,RUS,False,0.251713


### Men's Tennis Regression

In [248]:
# Preserve Tennis column
mTennis_list = male_df['Tennis'].tolist()

In [249]:
# Delete all sport columns
mTennis_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mTennis_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [250]:
# Add Tennis column back in
mTennis_df['Tennis'] = mTennis_list

mTennis_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Tennis
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [251]:
mTennis_dummies = pd.get_dummies(mTennis_df)
mTennis_dummies.head(3)

Unnamed: 0,Tennis,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [252]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mTennis_dummies.drop(columns=['Tennis'])
y_train = mTennis_dummies['Tennis']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mTennis_dummies.drop(columns=['Tennis'])
y_test = mTennis_dummies['Tennis']

In [253]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.989868797474598

In [254]:
mTennis_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mTennis_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mTennis_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mTennis_df)

array([0.43944216, 0.08335977, 0.1621284 , ..., 0.32590143, 0.76617355,
       0.83543334])

Unnamed: 0,Age,Height,Weight,NOC,Tennis
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [255]:
mTennis_pred = y_pred.tolist()

In [256]:
mTennis_df['Men_Tennis_Prediction'] = mTennis_pred

In [257]:
mTennis_df

Unnamed: 0,Age,Height,Weight,NOC,Tennis,Men_Tennis_Prediction
0,1,4,2,CHN,False,0.439442
1,1,3,1,CHN,False,0.083360
26,2,4,1,FIN,False,0.162128
29,2,5,4,FIN,False,0.000448
52,1,5,2,NOR,False,0.517773
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.283420
204023,2,3,1,ARG,False,0.379951
204024,1,4,1,USA,False,0.325901
204025,1,5,1,RUS,False,0.766174


### Men's Cycling Regression

In [258]:
# Preserve Cycling column
mCycling_list = male_df['Cycling'].tolist()

In [259]:
# Delete all sport columns
mCycling_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mCycling_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [260]:
# Add Cycling column back in
mCycling_df['Cycling'] = mCycling_list

mCycling_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Cycling
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [261]:
mCycling_dummies = pd.get_dummies(mCycling_df)
mCycling_dummies.head(3)

Unnamed: 0,Cycling,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [262]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mCycling_dummies.drop(columns=['Cycling'])
y_train = mCycling_dummies['Cycling']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mCycling_dummies.drop(columns=['Cycling'])
y_test = mCycling_dummies['Cycling']

In [263]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9384433264279373

In [264]:
mCycling_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mCycling_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mCycling_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mCycling_df)

array([0.35548516, 0.31268457, 0.57315939, ..., 0.63774612, 0.55875032,
       0.51465677])

Unnamed: 0,Age,Height,Weight,NOC,Cycling
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [265]:
mCycling_pred = y_pred.tolist()

In [266]:
mCycling_df['Men_Cycling_Prediction'] = mCycling_pred

In [267]:
mCycling_df

Unnamed: 0,Age,Height,Weight,NOC,Cycling,Men_Cycling_Prediction
0,1,4,2,CHN,False,0.355485
1,1,3,1,CHN,False,0.312685
26,2,4,1,FIN,False,0.573159
29,2,5,4,FIN,False,0.001576
52,1,5,2,NOR,False,0.597102
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.600767
204023,2,3,1,ARG,False,0.557548
204024,1,4,1,USA,False,0.637746
204025,1,5,1,RUS,False,0.558750


### Men's Hockey Regression

In [268]:
# Preserve Hockey column
mHockey_list = male_df['Hockey'].tolist()

In [269]:
# Delete all sport columns
mHockey_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mHockey_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [270]:
# Add Hockey column back in
mHockey_df['Hockey'] = mHockey_list

mHockey_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Hockey
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [271]:
mHockey_dummies = pd.get_dummies(mHockey_df)
mHockey_dummies.head(3)

Unnamed: 0,Hockey,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [272]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mHockey_dummies.drop(columns=['Hockey'])
y_train = mHockey_dummies['Hockey']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mHockey_dummies.drop(columns=['Hockey'])
y_test = mHockey_dummies['Hockey']

In [273]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9731577389760284

In [274]:
mHockey_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mHockey_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mHockey_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mHockey_df)

array([0.26929099, 0.21896663, 0.0028166 , ..., 0.24692776, 0.00075182,
       0.00104668])

Unnamed: 0,Age,Height,Weight,NOC,Hockey
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,True
204023,2,3,1,ARG,True
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [275]:
mHockey_pred = y_pred.tolist()

In [276]:
mHockey_df['Men_Hockey_Prediction'] = mHockey_pred

In [277]:
mHockey_df

Unnamed: 0,Age,Height,Weight,NOC,Hockey,Men_Hockey_Prediction
0,1,4,2,CHN,False,0.269291
1,1,3,1,CHN,False,0.218967
26,2,4,1,FIN,False,0.002817
29,2,5,4,FIN,False,0.000004
52,1,5,2,NOR,False,0.001549
...,...,...,...,...,...,...
204022,1,3,1,ARG,True,0.830429
204023,2,3,1,ARG,True,0.872121
204024,1,4,1,USA,False,0.246928
204025,1,5,1,RUS,False,0.000752


### Men's Archery Regression

In [278]:
# Preserve Archery column
mArchery_list = male_df['Archery'].tolist()

In [279]:
# Delete all sport columns
mArchery_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mArchery_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [280]:
# Add Archery column back in
mArchery_df['Archery'] = mArchery_list

mArchery_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Archery
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [281]:
mArchery_dummies = pd.get_dummies(mArchery_df)
mArchery_dummies.head(3)

Unnamed: 0,Archery,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [282]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mArchery_dummies.drop(columns=['Archery'])
y_train = mArchery_dummies['Archery']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mArchery_dummies.drop(columns=['Archery'])
y_test = mArchery_dummies['Archery']

In [283]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9902633915359574

In [284]:
mArchery_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mArchery_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mArchery_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mArchery_df)

array([0.86500334, 0.66345721, 0.70368837, ..., 0.54468345, 0.36700579,
       0.34959427])

Unnamed: 0,Age,Height,Weight,NOC,Archery
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [285]:
mArchery_pred = y_pred.tolist()

In [286]:
mArchery_df['Men_Archery_Prediction'] = mArchery_pred

In [287]:
mArchery_df

Unnamed: 0,Age,Height,Weight,NOC,Archery,Men_Archery_Prediction
0,1,4,2,CHN,False,0.865003
1,1,3,1,CHN,False,0.663457
26,2,4,1,FIN,False,0.703688
29,2,5,4,FIN,False,0.527957
52,1,5,2,NOR,False,0.620559
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.000561
204023,2,3,1,ARG,False,0.000520
204024,1,4,1,USA,False,0.544683
204025,1,5,1,RUS,False,0.367006


### Men's Volleyball Regression

In [288]:
# Preserve Volleyball column
mVolleyball_list = male_df['Volleyball'].tolist()

In [289]:
# Delete all sport columns
mVolleyball_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mVolleyball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [290]:
# Add Volleyball column back in
mVolleyball_df['Volleyball'] = mVolleyball_list

mVolleyball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Volleyball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [291]:
mVolleyball_dummies = pd.get_dummies(mVolleyball_df)
mVolleyball_dummies.head(3)

Unnamed: 0,Volleyball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [292]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mVolleyball_dummies.drop(columns=['Volleyball'])
y_train = mVolleyball_dummies['Volleyball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mVolleyball_dummies.drop(columns=['Volleyball'])
y_test = mVolleyball_dummies['Volleyball']

In [293]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.982874617737003

In [294]:
mVolleyball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mVolleyball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mVolleyball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mVolleyball_df)

array([1.25673942e-01, 3.42848426e-03, 3.77990074e-04, ...,
       1.00162240e-01, 3.90201051e-01, 4.92853036e-01])

Unnamed: 0,Age,Height,Weight,NOC,Volleyball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [295]:
mVolleyball_pred = y_pred.tolist()

In [296]:
mVolleyball_df['Men_Volleyball_Prediction'] = mVolleyball_pred

In [297]:
mVolleyball_df

Unnamed: 0,Age,Height,Weight,NOC,Volleyball,Men_Volleyball_Prediction
0,1,4,2,CHN,False,0.125674
1,1,3,1,CHN,False,0.003428
26,2,4,1,FIN,False,0.000378
29,2,5,4,FIN,False,0.000031
52,1,5,2,NOR,False,0.002509
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.016319
204023,2,3,1,ARG,False,0.024577
204024,1,4,1,USA,False,0.100162
204025,1,5,1,RUS,False,0.390201


### Men's Modern Pentathlon Regression

In [298]:
# Preserve Modern Pentathlon column
mModern_Pentathlon_list = male_df['Modern Pentathlon'].tolist()

In [299]:
# Delete all sport columns
mModern_Pentathlon_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mModern_Pentathlon_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [300]:
# Add Modern Pentathlon column back in
mModern_Pentathlon_df['Modern Pentathlon'] = mModern_Pentathlon_list

mModern_Pentathlon_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Modern Pentathlon
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [301]:
mModern_Pentathlon_dummies = pd.get_dummies(mModern_Pentathlon_df)
mModern_Pentathlon_dummies.head(3)

Unnamed: 0,Modern Pentathlon,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [302]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mModern_Pentathlon_dummies.drop(columns=['Modern Pentathlon'])
y_train = mModern_Pentathlon_dummies['Modern Pentathlon']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mModern_Pentathlon_dummies.drop(columns=['Modern Pentathlon'])
y_test = mModern_Pentathlon_dummies['Modern Pentathlon']

In [303]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9897109598500543

In [304]:
mModern_Pentathlon_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mModern_Pentathlon_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mModern_Pentathlon_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mModern_Pentathlon_df)

array([0.2267332 , 0.20507926, 0.87595498, ..., 0.68565517, 0.58948921,
       0.69232309])

Unnamed: 0,Age,Height,Weight,NOC,Modern Pentathlon
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [305]:
mModern_Pentathlon_pred = y_pred.tolist()

In [306]:
mModern_Pentathlon_df['Men_Modern_Pentathlon_Prediction'] = mModern_Pentathlon_pred

In [307]:
mModern_Pentathlon_df

Unnamed: 0,Age,Height,Weight,NOC,Modern Pentathlon,Men_Modern_Pentathlon_Prediction
0,1,4,2,CHN,False,0.226733
1,1,3,1,CHN,False,0.205079
26,2,4,1,FIN,False,0.875955
29,2,5,4,FIN,False,0.005956
52,1,5,2,NOR,False,0.001289
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.195268
204023,2,3,1,ARG,False,0.275481
204024,1,4,1,USA,False,0.685655
204025,1,5,1,RUS,False,0.589489


### Men's Table Tennis Regression

In [308]:
# Preserve Table Tennis column
mTable_Tennis_list = male_df['Table Tennis'].tolist()

In [309]:
# Delete all sport columns
mTable_Tennis_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mTable_Tennis_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [310]:
# Add Table Tennis column back in
mTable_Tennis_df['Table Tennis'] = mTable_Tennis_list

mTable_Tennis_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Table Tennis
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [311]:
mTable_Tennis_dummies = pd.get_dummies(mTable_Tennis_df)
mTable_Tennis_dummies.head(3)

Unnamed: 0,Table Tennis,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [312]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mTable_Tennis_dummies.drop(columns=['Table Tennis'])
y_train = mTable_Tennis_dummies['Table Tennis']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mTable_Tennis_dummies.drop(columns=['Table Tennis'])
y_test = mTable_Tennis_dummies['Table Tennis']

In [313]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9908552826279965

In [314]:
mTable_Tennis_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mTable_Tennis_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mTable_Tennis_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mTable_Tennis_df)

array([0.73515086, 0.80470886, 0.14395564, ..., 0.46374762, 0.57716737,
       0.65091145])

Unnamed: 0,Age,Height,Weight,NOC,Table Tennis
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [315]:
mTable_Tennis_pred = y_pred.tolist()

In [316]:
mTable_Tennis_df['Men_Table_Tennis_Prediction'] = mTable_Tennis_pred

In [317]:
mTable_Tennis_df

Unnamed: 0,Age,Height,Weight,NOC,Table Tennis,Men_Table_Tennis_Prediction
0,1,4,2,CHN,False,0.735151
1,1,3,1,CHN,False,0.804709
26,2,4,1,FIN,False,0.143956
29,2,5,4,FIN,False,0.000177
52,1,5,2,NOR,False,0.001432
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.433155
204023,2,3,1,ARG,False,0.510724
204024,1,4,1,USA,False,0.463748
204025,1,5,1,RUS,False,0.577167


### Men's Baseball Regression

In [318]:
# Preserve Baseball column
mBaseball_list = male_df['Baseball'].tolist()

In [319]:
# Delete all sport columns
mBaseball_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mBaseball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [320]:
# Add Baseball column back in
mBaseball_df['Baseball'] = mBaseball_list

mBaseball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Baseball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [321]:
mBaseball_dummies = pd.get_dummies(mBaseball_df)
mBaseball_dummies.head(3)

Unnamed: 0,Baseball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [322]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mBaseball_dummies.drop(columns=['Baseball'])
y_train = mBaseball_dummies['Baseball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mBaseball_dummies.drop(columns=['Baseball'])
y_test = mBaseball_dummies['Baseball']

In [323]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9916543356022491

In [324]:
mBaseball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mBaseball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mBaseball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mBaseball_df)

array([9.42784719e-01, 3.86674077e-01, 1.65170206e-04, ...,
       3.40194254e-01, 8.16667751e-05, 1.07817591e-04])

Unnamed: 0,Age,Height,Weight,NOC,Baseball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [325]:
mBaseball_pred = y_pred.tolist()

In [326]:
mBaseball_df['Men_Baseball_Prediction'] = mBaseball_pred

In [327]:
mBaseball_df

Unnamed: 0,Age,Height,Weight,NOC,Baseball,Men_Baseball_Prediction
0,1,4,2,CHN,False,0.942785
1,1,3,1,CHN,False,0.386674
26,2,4,1,FIN,False,0.000165
29,2,5,4,FIN,False,0.000061
52,1,5,2,NOR,False,0.000719
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.000042
204023,2,3,1,ARG,False,0.000056
204024,1,4,1,USA,False,0.340194
204025,1,5,1,RUS,False,0.000082


### Men's Rugby Sevens Regression

In [328]:
# Preserve Rugby Sevens column
mRugby_Sevens_list = male_df['Rugby Sevens'].tolist()

In [329]:
# Delete all sport columns
mRugby_Sevens_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mRugby_Sevens_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [330]:
# Add Rugby Sevens column back in
mRugby_Sevens_df['Rugby Sevens'] = mRugby_Sevens_list

mRugby_Sevens_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Rugby Sevens
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [331]:
mRugby_Sevens_dummies = pd.get_dummies(mRugby_Sevens_df)
mRugby_Sevens_dummies.head(3)

Unnamed: 0,Rugby Sevens,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [332]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mRugby_Sevens_dummies.drop(columns=['Rugby Sevens'])
y_train = mRugby_Sevens_dummies['Rugby Sevens']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mRugby_Sevens_dummies.drop(columns=['Rugby Sevens'])
y_test = mRugby_Sevens_dummies['Rugby Sevens']

In [333]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9985104074183684

In [334]:
mRugby_Sevens_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mRugby_Sevens_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mRugby_Sevens_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mRugby_Sevens_df)

array([7.25911462e-04, 3.55560658e-05, 3.63813835e-05, ...,
       1.32572205e-01, 1.04951258e-05, 1.51858393e-05])

Unnamed: 0,Age,Height,Weight,NOC,Rugby Sevens
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [335]:
mRugby_Sevens_pred = y_pred.tolist()

In [336]:
mRugby_Sevens_df['Men_Rugby_Sevens_Prediction'] = mRugby_Sevens_pred

In [337]:
mRugby_Sevens_df

Unnamed: 0,Age,Height,Weight,NOC,Rugby Sevens,Men_Rugby_Sevens_Prediction
0,1,4,2,CHN,False,7.259115e-04
1,1,3,1,CHN,False,3.555607e-05
26,2,4,1,FIN,False,3.638138e-05
29,2,5,4,FIN,False,8.472255e-07
52,1,5,2,NOR,False,3.031222e-04
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,5.048034e-01
204023,2,3,1,ARG,False,5.959628e-01
204024,1,4,1,USA,False,1.325722e-01
204025,1,5,1,RUS,False,1.049513e-05


### Men's Trampolining Regression

In [338]:
# Preserve Trampolining column
mTrampolining_list = male_df['Trampolining'].tolist()

In [339]:
# Delete all sport columns
mTrampolining_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mTrampolining_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [340]:
# Add Trampolining column back in
mTrampolining_df['Trampolining'] = mTrampolining_list

mTrampolining_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Trampolining
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [341]:
mTrampolining_dummies = pd.get_dummies(mTrampolining_df)
mTrampolining_dummies.head(3)

Unnamed: 0,Trampolining,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [342]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mTrampolining_dummies.drop(columns=['Trampolining'])
y_train = mTrampolining_dummies['Trampolining']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mTrampolining_dummies.drop(columns=['Trampolining'])
y_test = mTrampolining_dummies['Trampolining']

In [343]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9992700009864851

In [344]:
mTrampolining_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mTrampolining_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mTrampolining_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mTrampolining_df)

array([4.10535444e-01, 9.34655612e-01, 5.84395810e-04, ...,
       5.37788527e-01, 5.65165182e-01, 4.99206279e-01])

Unnamed: 0,Age,Height,Weight,NOC,Trampolining
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [345]:
mTrampolining_pred = y_pred.tolist()

In [346]:
mTrampolining_df['Men_Trampolining_Prediction'] = mTrampolining_pred

In [347]:
mTrampolining_df

Unnamed: 0,Age,Height,Weight,NOC,Trampolining,Men_Trampolining_Prediction
0,1,4,2,CHN,False,4.105354e-01
1,1,3,1,CHN,False,9.346556e-01
26,2,4,1,FIN,False,5.843958e-04
29,2,5,4,FIN,False,2.725333e-07
52,1,5,2,NOR,False,1.802425e-05
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,4.083484e-03
204023,2,3,1,ARG,False,3.134833e-03
204024,1,4,1,USA,False,5.377885e-01
204025,1,5,1,RUS,False,5.651652e-01


### Men's Beach Volleyball Regression

In [348]:
# Preserve Beach Volleyball column
mBeach_Volleyball_list = male_df['Beach Volleyball'].tolist()

In [349]:
# Delete all sport columns
mBeach_Volleyball_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mBeach_Volleyball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [350]:
# Add Beach Volleyball column back in
mBeach_Volleyball_df['Beach Volleyball'] = mBeach_Volleyball_list

mBeach_Volleyball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Beach Volleyball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [351]:
mBeach_Volleyball_dummies = pd.get_dummies(mBeach_Volleyball_df)
mBeach_Volleyball_dummies.head(3)

Unnamed: 0,Beach Volleyball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [352]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mBeach_Volleyball_dummies.drop(columns=['Beach Volleyball'])
y_train = mBeach_Volleyball_dummies['Beach Volleyball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mBeach_Volleyball_dummies.drop(columns=['Beach Volleyball'])
y_test = mBeach_Volleyball_dummies['Beach Volleyball']

In [353]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9973068955312222

In [354]:
mBeach_Volleyball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mBeach_Volleyball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mBeach_Volleyball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mBeach_Volleyball_df)

array([1.74910074e-02, 6.48770076e-06, 3.91648379e-05, ...,
       1.58518913e-03, 9.18485100e-02, 4.24728271e-01])

Unnamed: 0,Age,Height,Weight,NOC,Beach Volleyball
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [355]:
mBeach_Volleyball_pred = y_pred.tolist()

In [356]:
mBeach_Volleyball_df['Men_Beach_Volleyball_Prediction'] = mBeach_Volleyball_pred

In [357]:
mBeach_Volleyball_df

Unnamed: 0,Age,Height,Weight,NOC,Beach Volleyball,Men_Beach_Volleyball_Prediction
0,1,4,2,CHN,False,0.017491
1,1,3,1,CHN,False,0.000006
26,2,4,1,FIN,False,0.000039
29,2,5,4,FIN,False,0.000002
52,1,5,2,NOR,False,0.552503
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.000021
204023,2,3,1,ARG,False,0.000157
204024,1,4,1,USA,False,0.001585
204025,1,5,1,RUS,False,0.091849


### Men's Triathlon Regression

In [358]:
# Preserve Triathlon column
mTriathlon_list = male_df['Triathlon'].tolist()

In [359]:
# Delete all sport columns
mTriathlon_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mTriathlon_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [360]:
# Add Triathlon column back in
mTriathlon_df['Triathlon'] = mTriathlon_list

mTriathlon_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Triathlon
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [361]:
mTriathlon_dummies = pd.get_dummies(mTriathlon_df)
mTriathlon_dummies.head(3)

Unnamed: 0,Triathlon,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [362]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mTriathlon_dummies.drop(columns=['Triathlon'])
y_train = mTriathlon_dummies['Triathlon']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mTriathlon_dummies.drop(columns=['Triathlon'])
y_test = mTriathlon_dummies['Triathlon']

In [363]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9973858143434942

In [364]:
mTriathlon_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mTriathlon_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mTriathlon_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mTriathlon_df)

array([0.02313087, 0.09055616, 0.00452947, ..., 0.51618044, 0.93498716,
       0.98220152])

Unnamed: 0,Age,Height,Weight,NOC,Triathlon
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False
...,...,...,...,...,...
204022,1,3,1,ARG,False
204023,2,3,1,ARG,False
204024,1,4,1,USA,False
204025,1,5,1,RUS,False


In [365]:
mTriathlon_pred = y_pred.tolist()

In [366]:
mTriathlon_df['Men_Triathlon_Prediction'] = mTriathlon_pred

In [367]:
mTriathlon_df

Unnamed: 0,Age,Height,Weight,NOC,Triathlon,Men_Triathlon_Prediction
0,1,4,2,CHN,False,0.023131
1,1,3,1,CHN,False,0.090556
26,2,4,1,FIN,False,0.004529
29,2,5,4,FIN,False,0.000003
52,1,5,2,NOR,False,0.000203
...,...,...,...,...,...,...
204022,1,3,1,ARG,False,0.130910
204023,2,3,1,ARG,False,0.366283
204024,1,4,1,USA,False,0.516180
204025,1,5,1,RUS,False,0.934987


### Men's Golf Regression

In [368]:
# Preserve Golf column
mGolf_list = male_df['Golf'].tolist()

In [369]:
# Delete all sport columns
mGolf_df = male_df.drop(male_df.loc[:, 'Basketball':'Golf'], axis = 1)
mGolf_df.head()

Unnamed: 0,Age,Height,Weight,NOC
0,1,4,2,CHN
1,1,3,1,CHN
26,2,4,1,FIN
29,2,5,4,FIN
52,1,5,2,NOR


In [370]:
# Add Golf column back in
mGolf_df['Golf'] = mGolf_list

mGolf_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Golf
0,1,4,2,CHN,False
1,1,3,1,CHN,False
26,2,4,1,FIN,False
29,2,5,4,FIN,False
52,1,5,2,NOR,False


In [371]:
mGolf_dummies = pd.get_dummies(mGolf_df)
mGolf_dummies.head(3)

Unnamed: 0,Golf,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
26,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [372]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = mGolf_dummies.drop(columns=['Golf'])
y_train = mGolf_dummies['Golf']

# Convert categorical data to numeric and separate target feature for testing data
x_test = mGolf_dummies.drop(columns=['Golf'])
y_test = mGolf_dummies['Golf']

In [373]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9994771628686988

In [375]:
mGolf_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

mGolf_clf.fit(x_train, np.ravel(y_train.values))
y_pred = mGolf_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,mGolf_df)

KeyboardInterrupt: 

In [None]:
mGolf_pred = y_pred.tolist()

In [None]:
mGolf_df['Men_Golf_Prediction'] = mGolf_pred

In [None]:
mGolf_df

### Women's Basketball Regression

In [666]:
# Preserve Basketball column
fBasketball_list = female_df['Basketball'].tolist()

In [667]:
# Delete all sport columns
fBasketball_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fBasketball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [668]:
# Add basketball column back in
fBasketball_df['Basketball'] = fBasketball_list

fBasketball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Basketball
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [669]:
fBasketball_dummies = pd.get_dummies(fBasketball_df)
fBasketball_dummies.head(3)

Unnamed: 0,Basketball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [670]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fBasketball_dummies.drop(columns=['Basketball'])
y_train = fBasketball_dummies['Basketball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fBasketball_dummies.drop(columns=['Basketball'])
y_test = fBasketball_dummies['Basketball']

In [671]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9788800997545104

In [672]:
fBasketball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fBasketball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fBasketball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fBasketball_df)

array([5.11330485e-06, 7.72467553e-03, 6.41134362e-03, ...,
       2.43757552e-01, 2.43757552e-01, 5.87880464e-01])

Unnamed: 0,Age,Height,Weight,NOC,Basketball
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [673]:
fBasketball_pred = y_pred.tolist()

In [674]:
fBasketball_df['Women_Basketball_Prediction'] = fBasketball_pred

In [675]:
fBasketball_df

Unnamed: 0,Age,Height,Weight,NOC,Basketball,Women_Basketball_Prediction
51,1,4,6,ROU,False,0.000005
69,1,5,2,NOR,False,0.007725
81,2,4,2,EST,False,0.006411
82,2,4,2,EST,False,0.006411
101,1,3,1,AZE,False,0.000180
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.625518
204017,3,4,2,GRC,False,0.433776
204019,1,4,2,RUS,False,0.243758
204020,1,4,2,RUS,False,0.243758


### Women's Judo Regression

In [None]:
# Preserve Judo column
fJudo_list = female_df['Judo'].tolist()

In [None]:
# Delete all sport columns
fJudo_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fJudo_df.head()

In [None]:
# Add Judo column back in
fJudo_df['Judo'] = fJudo_list

fJudo_df.head()

In [None]:
fJudo_dummies = pd.get_dummies(fJudo_df)
fJudo_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fJudo_dummies.drop(columns=['Judo'])
y_train = fJudo_dummies['Judo']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fJudo_dummies.drop(columns=['Judo'])
y_test = fJudo_dummies['Judo']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fJudo_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fJudo_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fJudo_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fJudo_df)

In [None]:
fJudo_pred = y_pred.tolist()

In [None]:
fJudo_df['Women_Judo_Prediction'] = fJudo_pred

In [None]:
fJudo_df

### Women's Badminton Regression

In [None]:
# Preserve Badminton column
fBadminton_list = female_df['Badminton'].tolist()

In [None]:
# Delete all sport columns
fBadminton_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fBadminton_df.head()

In [None]:
# Add badminton column back in
fBadminton_df['Badminton'] = fBadminton_list

fBadminton_df.head()

In [None]:
fBadminton_dummies = pd.get_dummies(fBadminton_df)
fBadminton_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fBadminton_dummies.drop(columns=['Badminton'])
y_train = fBadminton_dummies['Badminton']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fBadminton_dummies.drop(columns=['Badminton'])
y_test = fBadminton_dummies['Badminton']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fBadminton_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fBadminton_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fBadminton_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fBadminton_df)

In [None]:
fBadminton_pred = y_pred.tolist()

In [None]:
fBadminton_df['Women_Badminton_Prediction'] = fBadminton_pred

In [None]:
fBadminton_df

### Women's Athletics Regression

In [None]:
# Preserve Athletics column
fAthletics_list = female_df['Athletics'].tolist()

In [None]:
# Delete all sport columns
fAthletics_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fAthletics_df.head()

In [None]:
# Add Athletics column back in
fAthletics_df['Athletics'] = fAthletics_list

fAthletics_df.head()

In [None]:
fAthletics_dummies = pd.get_dummies(fAthletics_df)
fAthletics_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fAthletics_dummies.drop(columns=['Athletics'])
y_train = fAthletics_dummies['Athletics']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fAthletics_dummies.drop(columns=['Athletics'])
y_test = fAthletics_dummies['Athletics']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fAthletics_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fAthletics_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fAthletics_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fAthletics_df)

In [None]:
fAthletics_pred = y_pred.tolist()

In [None]:
fAthletics_df['Women_Athletics_Prediction'] = fAthletics_pred

In [None]:
fAthletics_df

### Women's Weightlifting Regression

In [None]:
# Preserve Weightlifting column
fWeightlifting_list = female_df['Weightlifting'].tolist()

In [None]:
# Delete all sport columns
fWeightlifting_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fWeightlifting_df.head()

In [None]:
# Add Weightlifting column back in
fWeightlifting_df['Weightlifting'] = fWeightlifting_list

fWeightlifting_df.head()

In [None]:
fWeightlifting_dummies = pd.get_dummies(fWeightlifting_df)
fWeightlifting_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fWeightlifting_dummies.drop(columns=['Weightlifting'])
y_train = fWeightlifting_dummies['Weightlifting']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fWeightlifting_dummies.drop(columns=['Weightlifting'])
y_test = fWeightlifting_dummies['Weightlifting']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fWeightlifting_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fWeightlifting_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fWeightlifting_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fWeightlifting_df)

In [None]:
fWeightlifting_pred = y_pred.tolist()

In [None]:
fWeightlifting_df['Women_Weightlifting_Prediction'] = fWeightlifting_pred

In [None]:
fWeightlifting_df

### Women's Wrestling Regression

In [None]:
# Preserve Wrestling column
fWrestling_list = female_df['Wrestling'].tolist()

In [None]:
# Delete all sport columns
fWrestling_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fWrestling_df.head()

In [None]:
# Add Wrestling column back in
fWrestling_df['Wrestling'] = fWrestling_list

fWrestling_df.head()

In [None]:
fWrestling_dummies = pd.get_dummies(fWrestling_df)
fWrestling_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fWrestling_dummies.drop(columns=['Wrestling'])
y_train = fWrestling_dummies['Wrestling']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fWrestling_dummies.drop(columns=['Wrestling'])
y_test = fWrestling_dummies['Wrestling']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fWrestling_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fWrestling_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fWrestling_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fWrestling_df)

In [None]:
fWrestling_pred = y_pred.tolist()

In [None]:
fWrestling_df['Women_Wrestling_Prediction'] = fWrestling_pred

In [None]:
fWrestling_df

### Women's Rowing Regression

In [None]:
# Preserve Rowing column
fRowing_list = female_df['Rowing'].tolist()

In [None]:
# Delete all sport columns
fRowing_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fRowing_df.head()

In [None]:
# Add Rowing column back in
fRowing_df['Rowing'] = fRowing_list

fRowing_df.head()

In [None]:
fRowing_dummies = pd.get_dummies(fRowing_df)
fRowing_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fRowing_dummies.drop(columns=['Rowing'])
y_train = fRowing_dummies['Rowing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fRowing_dummies.drop(columns=['Rowing'])
y_test = fRowing_dummies['Rowing']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fRowing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fRowing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fRowing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fRowing_df)

In [None]:
fRowing_pred = y_pred.tolist()

In [None]:
fRowing_df['Women_Rowing_Prediction'] = fRowing_pred

In [None]:
fRowing_df

### Women's Swimming Regression

In [None]:
# Preserve Swimming column
fSwimming_list = female_df['Swimming'].tolist()

In [None]:
# Delete all sport columns
fSwimming_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fSwimming_df.head()

In [None]:
# Add Swimming column back in
fSwimming_df['Swimming'] = fSwimming_list

fSwimming_df.head()

In [None]:
fSwimming_dummies = pd.get_dummies(fSwimming_df)
fSwimming_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fSwimming_dummies.drop(columns=['Swimming'])
y_train = fSwimming_dummies['Swimming']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fSwimming_dummies.drop(columns=['Swimming'])
y_test = fSwimming_dummies['Swimming']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fSwimming_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fSwimming_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fSwimming_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fSwimming_df)

In [None]:
fSwimming_pred = y_pred.tolist()

In [None]:
fSwimming_df['Women_Swimming_Prediction'] = fSwimming_pred

In [None]:
fSwimming_df

### Women's Football Regression

In [732]:
# Preserve Football column
fFootball_list = female_df['Football'].tolist()

In [733]:
# Delete all sport columns
fFootball_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fFootball_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [734]:
# Add Football column back in
fFootball_df['Football'] = fFootball_list

fFootball_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Football
51,1,4,6,ROU,False
69,1,5,2,NOR,True
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [735]:
fFootball_dummies = pd.get_dummies(fFootball_df)
fFootball_dummies.head(3)

Unnamed: 0,Football,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,True,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [736]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fFootball_dummies.drop(columns=['Football'])
y_train = fFootball_dummies['Football']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fFootball_dummies.drop(columns=['Football'])
y_test = fFootball_dummies['Football']

In [737]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9808284300354596

In [738]:
fFootball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fFootball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fFootball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fFootball_df)

array([8.83533980e-05, 7.90498295e-01, 3.20013972e-03, ...,
       1.73016133e-03, 1.73016133e-03, 2.85361270e-03])

Unnamed: 0,Age,Height,Weight,NOC,Football
51,1,4,6,ROU,False
69,1,5,2,NOR,True
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [739]:
fFootball_pred = y_pred.tolist()

In [740]:
fFootball_df['Women_Football_Prediction'] = fFootball_pred

In [741]:
fFootball_df

Unnamed: 0,Age,Height,Weight,NOC,Football,Women_Football_Prediction
51,1,4,6,ROU,False,0.000088
69,1,5,2,NOR,True,0.790498
81,2,4,2,EST,False,0.003200
82,2,4,2,EST,False,0.003200
101,1,3,1,AZE,False,0.001650
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.001908
204017,3,4,2,GRC,False,0.744691
204019,1,4,2,RUS,False,0.001730
204020,1,4,2,RUS,False,0.001730


### Women's Equestrianism Regression

In [None]:
# Preserve Equestrianism column
fEquestrianism_list = female_df['Equestrianism'].tolist()

In [None]:
# Delete all sport columns
fEquestrianism_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fEquestrianism_df.head()

In [None]:
# Add Equestrianism column back in
fEquestrianism_df['Equestrianism'] = fEquestrianism_list

fEquestrianism_df.head()

In [None]:
fEquestrianism_dummies = pd.get_dummies(fEquestrianism_df)
fEquestrianism_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fEquestrianism_dummies.drop(columns=['Equestrianism'])
y_train = fEquestrianism_dummies['Equestrianism']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fEquestrianism_dummies.drop(columns=['Equestrianism'])
y_test = fEquestrianism_dummies['Equestrianism']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fEquestrianism_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fEquestrianism_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fEquestrianism_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fEquestrianism_df)

In [None]:
fEquestrianism_pred = y_pred.tolist()

In [None]:
fEquestrianism_df['Women_Equestrianism_Prediction'] = fEquestrianism_pred

In [None]:
fEquestrianism_df

### Women's Shooting Regression

In [None]:
# Preserve Shooting column
fShooting_list = female_df['Shooting'].tolist()

In [None]:
# Delete all sport columns
fShooting_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fShooting_df.head()

In [None]:
# Add Shooting column back in
fShooting_df['Shooting'] = fShooting_list

fShooting_df.head()

In [None]:
fShooting_dummies = pd.get_dummies(fShooting_df)
fShooting_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fShooting_dummies.drop(columns=['Shooting'])
y_train = fShooting_dummies['Shooting']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fShooting_dummies.drop(columns=['Shooting'])
y_test = fShooting_dummies['Shooting']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fShooting_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fShooting_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fShooting_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fShooting_df)

In [None]:
fShooting_pred = y_pred.tolist()

In [None]:
fShooting_df['Women_Shooting_Prediction'] = fShooting_pred

In [None]:
fShooting_df

### Women's Gymnastics Regression

In [None]:
# Preserve Gymnastics column
fGymnastics_list = female_df['Gymnastics'].tolist()

In [None]:
# Delete all sport columns
fGymnastics_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fGymnastics_df.head()

In [None]:
# Add Gymnastics column back in
fGymnastics_df['Gymnastics'] = fGymnastics_list

fGymnastics_df.head()

In [None]:
fGymnastics_dummies = pd.get_dummies(fGymnastics_df)
fGymnastics_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fGymnastics_dummies.drop(columns=['Gymnastics'])
y_train = fGymnastics_dummies['Gymnastics']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fGymnastics_dummies.drop(columns=['Gymnastics'])
y_test = fGymnastics_dummies['Gymnastics']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fGymnastics_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fGymnastics_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fGymnastics_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fGymnastics_df)

In [None]:
fGymnastics_pred = y_pred.tolist()

In [None]:
fGymnastics_df['Women_Gymnastics_Prediction'] = fGymnastics_pred

In [None]:
fGymnastics_df

### Women's Taekwondo Regression

In [None]:
# Preserve Taekwondo column
fTaekwondo_list = female_df['Taekwondo'].tolist()

In [None]:
# Delete all sport columns
fTaekwondo_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fTaekwondo_df.head()

In [None]:
# Add Taekwondo column back in
fTaekwondo_df['Taekwondo'] = fTaekwondo_list

fTaekwondo_df.head()

In [None]:
fTaekwondo_dummies = pd.get_dummies(fTaekwondo_df)
fTaekwondo_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fTaekwondo_dummies.drop(columns=['Taekwondo'])
y_train = fTaekwondo_dummies['Taekwondo']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fTaekwondo_dummies.drop(columns=['Taekwondo'])
y_test = fTaekwondo_dummies['Taekwondo']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fTaekwondo_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fTaekwondo_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fTaekwondo_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fTaekwondo_df)

In [None]:
fTaekwondo_pred = y_pred.tolist()

In [None]:
fTaekwondo_df['Women_Taekwondo_Prediction'] = fTaekwondo_pred

In [None]:
fTaekwondo_df

### Women's Boxing Regression

In [742]:
# Preserve Boxing column
fBoxing_list = female_df['Boxing'].tolist()

In [743]:
# Delete all sport columns
fBoxing_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fBoxing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [744]:
# Add Boxing column back in
fBoxing_df['Boxing'] = fBoxing_list

fBoxing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Boxing
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [745]:
fBoxing_dummies = pd.get_dummies(fBoxing_df)
fBoxing_dummies.head(3)

Unnamed: 0,Boxing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [746]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fBoxing_dummies.drop(columns=['Boxing'])
y_train = fBoxing_dummies['Boxing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fBoxing_dummies.drop(columns=['Boxing'])
y_test = fBoxing_dummies['Boxing']

In [747]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

0.9988504851342399

In [748]:
fBoxing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fBoxing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fBoxing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fBoxing_df)

array([1.56964746e-05, 2.09670123e-04, 1.93100295e-03, ...,
       4.67804849e-01, 4.67804849e-01, 2.17969632e-03])

Unnamed: 0,Age,Height,Weight,NOC,Boxing
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [749]:
fBoxing_pred = y_pred.tolist()

In [750]:
fBoxing_df['Women_Boxing_Prediction'] = fBoxing_pred

In [751]:
fBoxing_df

Unnamed: 0,Age,Height,Weight,NOC,Boxing,Women_Boxing_Prediction
51,1,4,6,ROU,False,0.000016
69,1,5,2,NOR,False,0.000210
81,2,4,2,EST,False,0.001931
82,2,4,2,EST,False,0.001931
101,1,3,1,AZE,False,0.915797
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.023992
204017,3,4,2,GRC,False,0.002022
204019,1,4,2,RUS,False,0.467805
204020,1,4,2,RUS,False,0.467805


### Women's Fencing Regression

In [None]:
# Preserve Fencing column
fFencing_list = female_df['Fencing'].tolist()

In [None]:
# Delete all sport columns
fFencing_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fFencing_df.head()

In [None]:
# Add Fencing column back in
fFencing_df['Fencing'] = fFencing_list

fFencing_df.head()

In [None]:
fFencing_dummies = pd.get_dummies(fFencing_df)
fFencing_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fFencing_dummies.drop(columns=['Fencing'])
y_train = fFencing_dummies['Fencing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fFencing_dummies.drop(columns=['Fencing'])
y_test = fFencing_dummies['Fencing']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fFencing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fFencing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fFencing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fFencing_df)

In [None]:
fFencing_pred = y_pred.tolist()

In [None]:
fFencing_df['Women_Fencing_Prediction'] = fFencing_pred

In [None]:
fFencing_df

### Women's Diving Regression

In [None]:
# Preserve Diving column
fDiving_list = female_df['Diving'].tolist()

In [None]:
# Delete all sport columns
fDiving_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fDiving_df.head()

In [None]:
# Add Diving column back in
fDiving_df['Diving'] = fDiving_list

fDiving_df.head()

In [None]:
fDiving_dummies = pd.get_dummies(fDiving_df)
fDiving_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fDiving_dummies.drop(columns=['Diving'])
y_train = fDiving_dummies['Diving']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fDiving_dummies.drop(columns=['Diving'])
y_test = fDiving_dummies['Diving']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fDiving_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fDiving_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fDiving_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fDiving_df)

In [None]:
fDiving_pred = y_pred.tolist()

In [None]:
fDiving_df['Women_Diving_Prediction'] = fDiving_pred

In [None]:
fDiving_df

### Women's Canoeing Regression

In [772]:
# Preserve Canoeing column
fCanoeing_list = female_df['Canoeing'].tolist()

In [773]:
# Delete all sport columns
fCanoeing_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fCanoeing_df.head()

Unnamed: 0,Age,Height,Weight,NOC
51,1,4,6,ROU
69,1,5,2,NOR
81,2,4,2,EST
82,2,4,2,EST
101,1,3,1,AZE


In [774]:
# Add Canoeing column back in
fCanoeing_df['Canoeing'] = fCanoeing_list

fCanoeing_df.head()

Unnamed: 0,Age,Height,Weight,NOC,Canoeing
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False


In [775]:
fCanoeing_dummies = pd.get_dummies(fCanoeing_df)
fCanoeing_dummies.head(3)

Unnamed: 0,Canoeing,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
51,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
69,False,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
81,False,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [776]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fCanoeing_dummies.drop(columns=['Canoeing'])
y_train = fCanoeing_dummies['Canoeing']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fCanoeing_dummies.drop(columns=['Canoeing'])
y_test = fCanoeing_dummies['Canoeing']

In [777]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.9752367221291354

In [778]:
fCanoeing_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fCanoeing_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fCanoeing_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fCanoeing_df)

array([0.10937658, 0.52774439, 0.05301833, ..., 0.58957656, 0.58957656,
       0.83531217])

Unnamed: 0,Age,Height,Weight,NOC,Canoeing
51,1,4,6,ROU,False
69,1,5,2,NOR,False
81,2,4,2,EST,False
82,2,4,2,EST,False
101,1,3,1,AZE,False
...,...,...,...,...,...
204000,3,4,3,URS,False
204017,3,4,2,GRC,False
204019,1,4,2,RUS,False
204020,1,4,2,RUS,False


In [779]:
fCanoeing_pred = y_pred.tolist()

In [780]:
fCanoeing_df['Women_Canoeing_Prediction'] = fCanoeing_pred

In [781]:
fCanoeing_df

Unnamed: 0,Age,Height,Weight,NOC,Canoeing,Women_Canoeing_Prediction
51,1,4,6,ROU,False,0.109377
69,1,5,2,NOR,False,0.527744
81,2,4,2,EST,False,0.053018
82,2,4,2,EST,False,0.053018
101,1,3,1,AZE,False,0.186094
...,...,...,...,...,...,...
204000,3,4,3,URS,False,0.526201
204017,3,4,2,GRC,False,0.361264
204019,1,4,2,RUS,False,0.589577
204020,1,4,2,RUS,False,0.589577


### Women's Handball Regression

In [None]:
# Preserve Handball column
fHandball_list = female_df['Handball'].tolist()

In [None]:
# Delete all sport columns
fHandball_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fHandball_df.head()

In [None]:
# Add Handball column back in
fHandball_df['Handball'] = fHandball_list

fHandball_df.head()

In [None]:
fHandball_dummies = pd.get_dummies(fHandball_df)
fHandball_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fHandball_dummies.drop(columns=['Handball'])
y_train = fHandball_dummies['Handball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fHandball_dummies.drop(columns=['Handball'])
y_test = fHandball_dummies['Handball']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fHandball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fHandball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fHandball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fHandball_df)

In [None]:
fHandball_pred = y_pred.tolist()

In [None]:
fHandball_df['Women_Handball_Prediction'] = fHandball_pred

In [None]:
fHandball_df

### Women's Water Polo Regression

In [None]:
# Preserve Water Polo column
fWater_Polo_list = female_df['Water Polo'].tolist()

In [None]:
# Delete all sport columns
fWater_Polo_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fWater_Polo_df.head()

In [None]:
# Add Water Polo column back in
fWater_Polo_df['Water Polo'] = fWater_Polo_list

fWater_Polo_df.head()

In [None]:
fWater_Polo_dummies = pd.get_dummies(fWater_Polo_df)
fWater_Polo_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fWater_Polo_dummies.drop(columns=['Water Polo'])
y_train = fWater_Polo_dummies['Water Polo']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fWater_Polo_dummies.drop(columns=['Water Polo'])
y_test = fWater_Polo_dummies['Water Polo']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fWater_Polo_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fWater_Polo_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fWater_Polo_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fWater_Polo_df)

In [None]:
fWater_Polo_pred = y_pred.tolist()

In [None]:
fWater_Polo_df['Women_Water_Polo_Prediction'] = fWater_Polo_pred

In [None]:
fWater_Polo_df

### Women's Tennis Regression

In [None]:
# Preserve Tennis column
fTennis_list = female_df['Tennis'].tolist()

In [None]:
# Delete all sport columns
fTennis_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fTennis_df.head()

In [None]:
# Add Tennis column back in
fTennis_df['Tennis'] = fTennis_list

fTennis_df.head()

In [None]:
fTennis_dummies = pd.get_dummies(fTennis_df)
fTennis_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fTennis_dummies.drop(columns=['Tennis'])
y_train = fTennis_dummies['Tennis']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fTennis_dummies.drop(columns=['Tennis'])
y_test = fTennis_dummies['Tennis']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fTennis_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fTennis_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fTennis_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fTennis_df)

In [None]:
fTennis_pred = y_pred.tolist()

In [None]:
fTennis_df['Women_Tennis_Prediction'] = fTennis_pred

In [None]:
fTennis_df

### Women's Cycling Regression

In [None]:
# Preserve Cycling column
fCycling_list = female_df['Cycling'].tolist()

In [None]:
# Delete all sport columns
fCycling_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fCycling_df.head()

In [None]:
# Add Cycling column back in
fCycling_df['Cycling'] = fCycling_list

fCycling_df.head()

In [None]:
fCycling_dummies = pd.get_dummies(fCycling_df)
fCycling_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fCycling_dummies.drop(columns=['Cycling'])
y_train = fCycling_dummies['Cycling']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fCycling_dummies.drop(columns=['Cycling'])
y_test = fCycling_dummies['Cycling']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fCycling_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fCycling_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fCycling_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fCycling_df)

In [None]:
fCycling_pred = y_pred.tolist()

In [None]:
fCycling_df['Women_Cycling_Prediction'] = fCycling_pred

In [None]:
fCycling_df

### Women's Hockey Regression

In [None]:
# Preserve Hockey column
fHockey_list = female_df['Hockey'].tolist()

In [None]:
# Delete all sport columns
fHockey_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fHockey_df.head()

In [None]:
# Add Hockey column back in
fHockey_df['Hockey'] = fHockey_list

fHockey_df.head()

In [None]:
fHockey_dummies = pd.get_dummies(fHockey_df)
fHockey_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fHockey_dummies.drop(columns=['Hockey'])
y_train = fHockey_dummies['Hockey']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fHockey_dummies.drop(columns=['Hockey'])
y_test = fHockey_dummies['Hockey']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fHockey_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fHockey_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fHockey_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fHockey_df)

In [None]:
fHockey_pred = y_pred.tolist()

In [None]:
fHockey_df['Women_Hockey_Prediction'] = fHockey_pred

In [None]:
fHockey_df

### Women's Archery Regression

In [None]:
# Preserve Archery column
fArchery_list = female_df['Archery'].tolist()

In [None]:
# Delete all sport columns
fArchery_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fArchery_df.head()

In [None]:
# Add Archery column back in
fArchery_df['Archery'] = fArchery_list

fArchery_df.head()

In [None]:
fArchery_dummies = pd.get_dummies(fArchery_df)
fArchery_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fArchery_dummies.drop(columns=['Archery'])
y_train = fArchery_dummies['Archery']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fArchery_dummies.drop(columns=['Archery'])
y_test = fArchery_dummies['Archery']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fArchery_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fArchery_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fArchery_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fArchery_df)

In [None]:
fArchery_pred = y_pred.tolist()

In [None]:
fArchery_df['Women_Archery_Prediction'] = fArchery_pred

In [None]:
fArchery_df

### Women's Softball Regression

In [None]:
# Preserve Softball column
fSoftball_list = female_df['Softball'].tolist()

In [None]:
# Delete all sport columns
fSoftball_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fSoftball_df.head()

In [None]:
# Add Softball column back in
fSoftball_df['Softball'] = fSoftball_list

fSoftball_df.head()

In [None]:
fSoftball_dummies = pd.get_dummies(fSoftball_df)
fSoftball_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fSoftball_dummies.drop(columns=['Softball'])
y_train = fSoftball_dummies['Softball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fSoftball_dummies.drop(columns=['Softball'])
y_test = fSoftball_dummies['Softball']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fSoftball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fSoftball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fSoftball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fSoftball_df)

In [None]:
fSoftball_pred = y_pred.tolist()

In [None]:
fSoftball_df['Women_Softball_Prediction'] = fSoftball_pred

In [None]:
fSoftball_df

### Women's Volleyball Regression

In [None]:
# Preserve Volleyball column
fVolleyball_list = female_df['Volleyball'].tolist()

In [None]:
# Delete all sport columns
fVolleyball_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fVolleyball_df.head()

In [None]:
# Add Volleyball column back in
fVolleyball_df['Volleyball'] = fVolleyball_list

fVolleyball_df.head()

In [None]:
fVolleyball_dummies = pd.get_dummies(fVolleyball_df)
fVolleyball_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fVolleyball_dummies.drop(columns=['Volleyball'])
y_train = fVolleyball_dummies['Volleyball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fVolleyball_dummies.drop(columns=['Volleyball'])
y_test = fVolleyball_dummies['Volleyball']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fVolleyball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fVolleyball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fVolleyball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fVolleyball_df)

In [None]:
fVolleyball_pred = y_pred.tolist()

In [None]:
fVolleyball_df['Women_Volleyball_Prediction'] = fVolleyball_pred

In [None]:
fVolleyball_df

### Women's Modern Pentathlon Regression

In [None]:
# Preserve Modern Pentathlon column
fModern_Pentathlon_list = female_df['Modern Pentathlon'].tolist()

In [None]:
# Delete all sport columns
fModern_Pentathlon_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fModern_Pentathlon_df.head()

In [None]:
# Add Modern Pentathlon column back in
fModern_Pentathlon_df['Modern Pentathlon'] = fModern_Pentathlon_list

fModern_Pentathlon_df.head()

In [None]:
fModern_Pentathlon_dummies = pd.get_dummies(fModern_Pentathlon_df)
fModern_Pentathlon_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fModern_Pentathlon_dummies.drop(columns=['Modern Pentathlon'])
y_train = fModern_Pentathlon_dummies['Modern Pentathlon']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fModern_Pentathlon_dummies.drop(columns=['Modern Pentathlon'])
y_test = fModern_Pentathlon_dummies['Modern Pentathlon']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fModern_Pentathlon_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fModern_Pentathlon_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fModern_Pentathlon_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fModern_Pentathlon_df)

In [None]:
fModern_Pentathlon_pred = y_pred.tolist()

In [None]:
fModern_Pentathlon_df['Women_Modern_Pentathlon_Prediction'] = fModern_Pentathlon_pred

In [None]:
fModern_Pentathlon_df

### Women's Table Tennis Regression

In [None]:
# Preserve Table Tennis column
fTable_Tennis_list = female_df['Table Tennis'].tolist()

In [None]:
# Delete all sport columns
fTable_Tennis_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fTable_Tennis_df.head()

In [None]:
# Add Table Tennis column back in
fTable_Tennis_df['Table Tennis'] = fTable_Tennis_list

fTable_Tennis_df.head()

In [None]:
fTable_Tennis_dummies = pd.get_dummies(fTable_Tennis_df)
fTable_Tennis_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fTable_Tennis_dummies.drop(columns=['Table Tennis'])
y_train = fTable_Tennis_dummies['Table Tennis']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fTable_Tennis_dummies.drop(columns=['Table Tennis'])
y_test = fTable_Tennis_dummies['Table Tennis']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fTable_Tennis_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fTable_Tennis_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fTable_Tennis_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fTable_Tennis_df)

In [None]:
fTable_Tennis_pred = y_pred.tolist()

In [None]:
fTable_Tennis_df['Women_Table_Tennis_Prediction'] = fTable_Tennis_pred

In [None]:
fTable_Tennis_df

### Women's Synchronized Swimming Regression

In [None]:
# Preserve Synchronized Swimming column
fSynchronized_Swimming_list = female_df['Synchronized Swimming'].tolist()

In [None]:
# Delete all sport columns
fSynchronized_Swimming_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fSynchronized_Swimming_df.head()

In [None]:
# Add Synchronized Swimming column back in
fSynchronized_Swimming_df['Synchronized Swimming'] = fSynchronized_Swimming_list

fSynchronized_Swimming_df.head()

In [None]:
fSynchronized_Swimming_dummies = pd.get_dummies(fSynchronized_Swimming_df)
fSynchronized_Swimming_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fSynchronized_Swimming_dummies.drop(columns=['Synchronized Swimming'])
y_train = fSynchronized_Swimming_dummies['Synchronized Swimming']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fSynchronized_Swimming_dummies.drop(columns=['Synchronized Swimming'])
y_test = fSynchronized_Swimming_dummies['Synchronized Swimming']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fSynchronized_Swimming_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fSynchronized_Swimming_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fSynchronized_Swimming_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fSynchronized_Swimming_df)

In [None]:
fSynchronized_Swimming_pred = y_pred.tolist()

In [None]:
fSynchronized_Swimming_df['Women_Synchronized_Swimming_Prediction'] = fSynchronized_Swimming_pred

In [None]:
fSynchronized_Swimming_df

### Women's Rhythmic Gymnastics Regression

In [None]:
# Preserve Rhythmic Gymnastics column
fRhythmic_Gymnastics_list = female_df['Rhythmic Gymnastics'].tolist()

In [None]:
# Delete all sport columns
fRhythmic_Gymnastics_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fRhythmic_Gymnastics_df.head()

In [None]:
# Add Rhythmic Gymnastics column back in
fRhythmic_Gymnastics_df['Rhythmic Gymnastics'] = fRhythmic_Gymnastics_list

fRhythmic_Gymnastics_df.head()

In [None]:
fRhythmic_Gymnastics_dummies = pd.get_dummies(fRhythmic_Gymnastics_df)
fRhythmic_Gymnastics_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fRhythmic_Gymnastics_dummies.drop(columns=['Rhythmic Gymnastics'])
y_train = fRhythmic_Gymnastics_dummies['Rhythmic Gymnastics']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fRhythmic_Gymnastics_dummies.drop(columns=['Rhythmic Gymnastics'])
y_test = fRhythmic_Gymnastics_dummies['Rhythmic Gymnastics']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fRhythmic_Gymnastics_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fRhythmic_Gymnastics_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fRhythmic_Gymnastics_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fRhythmic_Gymnastics_df)

In [None]:
fRhythmic_Gymnastics_pred = y_pred.tolist()

In [None]:
fRhythmic_Gymnastics_df['Women_Rhythmic_Gymnastics_Prediction'] = fRhythmic_Gymnastics_pred

In [None]:
fRhythmic_Gymnastics_df

### Women's Rugby Sevens Regression

In [None]:
# Preserve Rugby Sevens column
fRugby_Sevens_list = female_df['Rugby Sevens'].tolist()

In [None]:
# Delete all sport columns
fRugby_Sevens_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fRugby_Sevens_df.head()

In [None]:
# Add Rugby Sevens column back in
fRugby_Sevens_df['Rugby Sevens'] = fRugby_Sevens_list

fRugby_Sevens_df.head()

In [None]:
fRugby_Sevens_dummies = pd.get_dummies(fRugby_Sevens_df)
fRugby_Sevens_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fRugby_Sevens_dummies.drop(columns=['Rugby Sevens'])
y_train = fRugby_Sevens_dummies['Rugby Sevens']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fRugby_Sevens_dummies.drop(columns=['Rugby Sevens'])
y_test = fRugby_Sevens_dummies['Rugby Sevens']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fRugby_Sevens_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fRugby_Sevens_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fRugby_Sevens_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fRugby_Sevens_df)

In [None]:
fRugby_Sevens_pred = y_pred.tolist()

In [None]:
fRugby_Sevens_df['Women_Rugby_Sevens_Prediction'] = fRugby_Sevens_pred

In [None]:
fRugby_Sevens_df

### Women's Trampolining Regression

In [None]:
# Preserve Trampolining column
fTrampolining_list = female_df['Trampolining'].tolist()

In [None]:
# Delete all sport columns
fTrampolining_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fTrampolining_df.head()

In [None]:
# Add Trampolining column back in
fTrampolining_df['Trampolining'] = fTrampolining_list

fTrampolining_df.head()

In [None]:
fTrampolining_dummies = pd.get_dummies(fTrampolining_df)
fTrampolining_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fTrampolining_dummies.drop(columns=['Trampolining'])
y_train = fTrampolining_dummies['Trampolining']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fTrampolining_dummies.drop(columns=['Trampolining'])
y_test = fTrampolining_dummies['Trampolining']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fTrampolining_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fTrampolining_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fTrampolining_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fTrampolining_df)

In [None]:
fTrampolining_pred = y_pred.tolist()

In [None]:
fTrampolining_df['Women_Trampolining_Prediction'] = fTrampolining_pred

In [None]:
fTrampolining_df

### Women's Beach Volleyball Regression

In [None]:
# Preserve Beach Volleyball column
fBeach_Volleyball_list = female_df['Beach Volleyball'].tolist()

In [None]:
# Delete all sport columns
fBeach_Volleyball_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fBeach_Volleyball_df.head()

In [None]:
# Add Beach Volleyball column back in
fBeach_Volleyball_df['Beach Volleyball'] = fBeach_Volleyball_list

fBeach_Volleyball_df.head()

In [None]:
fBeach_Volleyball_dummies = pd.get_dummies(fBeach_Volleyball_df)
fBeach_Volleyball_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fBeach_Volleyball_dummies.drop(columns=['Beach Volleyball'])
y_train = fBeach_Volleyball_dummies['Beach Volleyball']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fBeach_Volleyball_dummies.drop(columns=['Beach Volleyball'])
y_test = fBeach_Volleyball_dummies['Beach Volleyball']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fBeach_Volleyball_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fBeach_Volleyball_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fBeach_Volleyball_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fBeach_Volleyball_df)

In [None]:
fBeach_Volleyball_pred = y_pred.tolist()

In [None]:
fBeach_Volleyball_df['Women_Beach_Volleyball_Prediction'] = fBeach_Volleyball_pred

In [None]:
fBeach_Volleyball_df

### Women's Triathlon Regression

In [None]:
# Preserve Triathlon column
fTriathlon_list = female_df['Triathlon'].tolist()

In [None]:
# Delete all sport columns
fTriathlon_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fTriathlon_df.head()

In [None]:
# Add Triathlon column back in
fTriathlon_df['Triathlon'] = fTriathlon_list

fTriathlon_df.head()

In [None]:
fTriathlon_dummies = pd.get_dummies(fTriathlon_df)
fTriathlon_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fTriathlon_dummies.drop(columns=['Triathlon'])
y_train = fTriathlon_dummies['Triathlon']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fTriathlon_dummies.drop(columns=['Triathlon'])
y_test = fTriathlon_dummies['Triathlon']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fTriathlon_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fTriathlon_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fTriathlon_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fTriathlon_df)

In [None]:
fTriathlon_pred = y_pred.tolist()

In [None]:
fTriathlon_df['Women_Triathlon_Prediction'] = fTriathlon_pred

In [None]:
fTriathlon_df

### Women's Golf Regression

In [None]:
# Preserve Golf column
fGolf_list = female_df['Golf'].tolist()

In [None]:
# Delete all sport columns
fGolf_df = female_df.drop(female_df.loc[:, 'Basketball':'Golf'], axis = 1)
fGolf_df.head()

In [None]:
# Add Golf column back in
fGolf_df['Golf'] = fGolf_list

fGolf_df.head()

In [None]:
fGolf_dummies = pd.get_dummies(fGolf_df)
fGolf_dummies.head(3)

In [None]:
from sklearn.model_selection import train_test_split

# Convert categorical data to numeric and separate target feature for training data
x_train = fGolf_dummies.drop(columns=['Golf'])
y_train = fGolf_dummies['Golf']

# Convert categorical data to numeric and separate target feature for testing data
x_test = fGolf_dummies.drop(columns=['Golf'])
y_test = fGolf_dummies['Golf']

In [None]:
# Train the Logistic Regression model on the unscaled data and print the model score
classifier = LogisticRegression()
classifier.fit(x_train, y_train)
classifier.score(x_test, y_test)

In [None]:
fGolf_clf = LogisticRegression(penalty='l1', dual=False, tol=0.001, C=1.0, fit_intercept=True, 
                   intercept_scaling=1, class_weight='balanced', random_state=None, 
                   solver='liblinear', max_iter=1000, multi_class='ovr', verbose=0)

fGolf_clf.fit(x_train, np.ravel(y_train.values))
y_pred = fGolf_clf.predict_proba(x_test)
y_pred = y_pred[:,1]

display(y_pred,fGolf_df)

In [None]:
fGolf_pred = y_pred.tolist()

In [None]:
fGolf_df['Women_Golf_Prediction'] = fGolf_pred

In [None]:
fGolf_df

### Export the Linear Regression Models

In [None]:
pickle.dump(mBasketball_clf, open('H5/mBasketball.pkl','wb'))
pickle.dump(mJudo_clf, open('H5/mJudo.pkl','wb'))
pickle.dump(mBadminton_clf, open('H5/mBadminton.pkl','wb'))
pickle.dump(mAthletics_clf, open('H5/mAthletics.pkl','wb'))
pickle.dump(mWeightlifting_clf, open('H5/mWeightlifting.pkl','wb'))
pickle.dump(mWrestling_clf, open('H5/mWrestling.pkl','wb'))
pickle.dump(mRowing_clf, open('H5/mRowing.pkl','wb'))
pickle.dump(mSwimming_clf, open('H5/mSwimming.pkl','wb'))
pickle.dump(mFootball_clf, open('H5/mFootball.pkl','wb'))
pickle.dump(mEquestrianism_clf, open('H5/mEquestrianism.pkl','wb'))
pickle.dump(mShooting_clf, open('H5/mShooting.pkl','wb'))
pickle.dump(mGymnastics_clf, open('H5/mGymnastics.pkl','wb'))
pickle.dump(mTaekwondo_clf, open('H5/mTaekwondo.pkl','wb'))
pickle.dump(mFencing_clf, open('H5/mFencing.pkl','wb'))
pickle.dump(mDiving_clf, open('H5/mDiving.pkl','wb'))
pickle.dump(mHandball_clf, open('H5/mHandball.pkl','wb'))
pickle.dump(mWater_Polo_clf, open('H5/mWater_Polo.pkl','wb'))
pickle.dump(mTennis_clf, open('H5/mTennis.pkl','wb'))
pickle.dump(mCycling_clf, open('H5/mCycling.pkl','wb'))
pickle.dump(mHockey_clf, open('H5/mHockey.pkl','wb'))
pickle.dump(mArchery_clf, open('H5/mArchery.pkl','wb'))
pickle.dump(mVolleyball_clf, open('H5/mVolleyball.pkl','wb'))
pickle.dump(mModern_Pentathlon_clf, open('H5/mModern_Pentathlon.pkl','wb'))
pickle.dump(mTable_Tennis_clf, open('H5/mTable_Tennis.pkl','wb'))
pickle.dump(mBaseball_clf, open('H5/mBaseball.pkl','wb'))
pickle.dump(mRugby_Sevens_clf, open('H5/mRugby_Sevens.pkl','wb'))
pickle.dump(mTrampolining_clf, open('H5/mTrampolining.pkl','wb'))
pickle.dump(mBeach_Volleyball_clf, open('H5/mBeach_Volleyball.pkl','wb'))
pickle.dump(mTriathlon_clf, open('H5/mTriathlon.pkl','wb'))
pickle.dump(mGolf_clf, open('H5/mGolf.pkl','wb'))

In [None]:
pickle.dump(fBasketball_clf, open('H5/fBasketball.pkl','wb'))
pickle.dump(fJudo_clf, open('H5/fJudo.pkl','wb'))
pickle.dump(fBadminton_clf, open('H5/fBadminton.pkl','wb'))
pickle.dump(fAthletics_clf, open('H5/fAthletics.pkl','wb'))
pickle.dump(fWeightlifting_clf, open('H5/fWeightlifting.pkl','wb'))
pickle.dump(fWrestling_clf, open('H5/fWrestling.pkl','wb'))
pickle.dump(fRowing_clf, open('H5/fRowing.pkl','wb'))
pickle.dump(fSwimming_clf, open('H5/fSwimming.pkl','wb'))
pickle.dump(fFootball_clf, open('H5/fFootball.pkl','wb'))
pickle.dump(fEquestrianism_clf, open('H5/fEquestrianism.pkl','wb'))
pickle.dump(fShooting_clf, open('H5/fShooting.pkl','wb'))
pickle.dump(fGymnastics_clf, open('H5/fGymnastics.pkl','wb'))
pickle.dump(fTaekwondo_clf, open('H5/fTaekwondo.pkl','wb'))
pickle.dump(fFencing_clf, open('H5/fFencing.pkl','wb'))
pickle.dump(fDiving_clf, open('H5/fDiving.pkl','wb'))
pickle.dump(fHandball_clf, open('H5/fHandball.pkl','wb'))
pickle.dump(fWater_Polo_clf, open('H5/fWater_Polo.pkl','wb'))
pickle.dump(fTennis_clf, open('H5/fTennis.pkl','wb'))
pickle.dump(fCycling_clf, open('H5/fCycling.pkl','wb'))
pickle.dump(fHockey_clf, open('H5/fHockey.pkl','wb'))
pickle.dump(fArchery_clf, open('H5/fArchery.pkl','wb'))
pickle.dump(fVolleyball_clf, open('H5/fVolleyball.pkl','wb'))
pickle.dump(fModern_Pentathlon_clf, open('H5/fModern_Pentathlon.pkl','wb'))
pickle.dump(fTable_Tennis_clf, open('H5/fTable_Tennis.pkl','wb'))
pickle.dump(fSoftball_clf, open('H5/fSoftball.pkl','wb'))
pickle.dump(fRugby_Sevens_clf, open('H5/fRugby_Sevens.pkl','wb'))
pickle.dump(fTrampolining_clf, open('H5/fTrampolining.pkl','wb'))
pickle.dump(fBeach_Volleyball_clf, open('H5/fBeach_Volleyball.pkl','wb'))
pickle.dump(fTriathlon_clf, open('H5/fTriathlon.pkl','wb'))
pickle.dump(fGolf_clf, open('H5/fGolf.pkl','wb'))
pickle.dump(fSynchronized_Swimming_clf, open('H5/fSynchronized_Swimming.pkl','wb'))
pickle.dump(fRhythmic_Gymnastics_clf, open('H5/fRhythmic_Gymnastics.pkl','wb'))

In [782]:
pickle.dump(fCanoeing_clf, open('H5/fCanoeing.pkl','wb'))
pickle.dump(mCanoeing_clf, open('H5/mCanoeing.pkl','wb'))


In [783]:
pickle.dump(mBasketball_clf, open('H5/mBasketball.pkl','wb'))
pickle.dump(fBasketball_clf, open('H5/fBasketball.pkl','wb'))


In [784]:
pickle.dump(fFootball_clf, open('H5/fFootball.pkl','wb'))
pickle.dump(mFootball_clf, open('H5/mFootball.pkl','wb'))


In [785]:
pickle.dump(fBoxing_clf, open('H5/fBoxing.pkl','wb'))
pickle.dump(mBoxing_clf, open('H5/mBoxing.pkl','wb'))


### User input Calc

In [786]:
fAge_bin_names

['0', '1', '2', '3', '4', '5', '6', '7', '8']

In [637]:
mAge_bins

array([12.        , 18.55555556, 25.11111111, 31.66666667, 38.22222222,
       44.77777778, 51.33333333, 57.88888889, 64.44444444, 71.        ])

In [626]:
print(mHeight_bins)

[127. 138. 149. 160. 171. 182. 193. 204. 215. 226.]


In [633]:
mWeight_bins

array([ 37.        ,  56.66666667,  76.33333333,  96.        ,
       115.66666667, 135.33333333, 155.        , 174.66666667,
       194.33333333, 214.        ])

In [640]:
fAge_bins

array([11.        , 17.44444444, 23.88888889, 30.33333333, 36.77777778,
       43.22222222, 49.66666667, 56.11111111, 62.55555556, 69.        ])

In [641]:
fHeight_bins

array([127.        , 136.55555556, 146.11111111, 155.66666667,
       165.22222222, 174.77777778, 184.33333333, 193.88888889,
       203.44444444, 213.        ])

In [630]:
fWeight_bins

[ 25.          40.77777778  56.55555556  72.33333333  88.11111111
 103.88888889 119.66666667 135.44444444 151.22222222 167.        ]


In [635]:
# convert numpy arrays to pandas list
mAge_bins_list = mAge_bins.tolist()
mHeight_bins_list = mHeight_bins.tolist()
mWeight_bins_list = mWeight_bins.tolist()

fAge_bins_list = fAge_bins.tolist()
fHeight_bins_list = fHeight_bins.tolist()
fWeight_bins_list = fWeight_bins.tolist()

mAge_bins_list

[12.0,
 18.555555555555557,
 25.11111111111111,
 31.666666666666664,
 38.22222222222222,
 44.77777777777778,
 51.33333333333333,
 57.888888888888886,
 64.44444444444444,
 71.0]

In [636]:
# export variables - Bins
mAge_PicklePath = "PredVariables/mAge_bins.pkl"
mAge_bins_list.to_pickle(mAge_bins_list)

mHeight_PicklePath = "PredVariables/mHeight_bins.pkl"
mHeight_bins_list.to_pickle(mHeight_bins_list)

mWeight_PicklePath = "PredVariables/mWeight_bins.pkl"
mWeight_bins_list.to_pickle(mWeight_PicklePath)

fAge_PicklePath = "PredVariables/fAge_bins.pkl"
fAge_bins_list.to_pickle(fAge_PicklePath)

fHeight_PicklePath = "PredVariables/fHeight_bins.pkl"
fHeight_bins_list.to_pickle(fHeight_PicklePath)

fWeight_PicklePath = "PredVariables/fWeight_bins.pkl"
fWeight_bins_list.to_pickle(fWeight_PicklePath)


AttributeError: 'list' object has no attribute 'to_pickle'

In [580]:
# export variables - Bin Names
mAge_PicklePath = "/PredVariables/mAge_bin_names.pkl"
mAge_bins_list.to_pickle(mAge_PicklePath)

mHeight_PicklePath = "/PredVariables/mHeight_bin_names.pkl"
mAge_bins_list.to_pickle(mHeight_PicklePath)

mWeight_PicklePath = "/PredVariables/mWeight_bin_names.pkl"
mAge_bins_list.to_pickle(mWeight_PicklePath)

fAge_PicklePath = "/PredVariables/fAge_bin_names.pkl"
fAge_bins_list.to_pickle(fAge_PicklePath)

fHeight_PicklePath = "/PredVariables/fHeight_bin_names.pkl"
fAge_bins_list.to_pickle(fHeight_PicklePath)

fWeight_PicklePath = "/PredVariables/fWeight_bin_names.pkl"
fAge_bins_list.to_pickle(fWeight_PicklePath)

AttributeError: 'numpy.ndarray' object has no attribute 'to_pickle'

In [527]:
# Create example user info
user_Sex = 'M'
user_Age = 30
user_Height = 185
user_Weight = 77
user_NOC = 'URS'

In [528]:
# Turn user input into dataframe

user_df = pd.DataFrame(
    {
     "Age": [user_Age],
     "Height": [user_Height],
     "Weight": [user_Weight],
     "NOC": [user_NOC]
     }
)

user_df

Unnamed: 0,Age,Height,Weight,NOC
0,30,185,77,URS


In [529]:
user_df['Age'] = user_df['Age'].astype(float, errors = 'raise')
user_df['Height'] = user_df['Height'].astype(float, errors = 'raise')
user_df['Weight'] = user_df['Weight'].astype(float, errors = 'raise')

### User Men's Sports Predictor

In [530]:
# bin user info
user_df["Age"] = pd.cut(user_df["Age"], mAge_bins, labels=mAge_bin_names)
user_df["Height"] = pd.cut(user_df["Height"], mHeight_bins, labels=mHeight_bin_names)
user_df["Weight"] = pd.cut(user_df["Weight"], mWeight_bins, labels=mWeight_bin_names)
user_df["NOC"] = 'NOC_' + user_df["NOC"].astype(str)

user_df

Unnamed: 0,Age,Height,Weight,NOC
0,2,5,2,NOC_URS


In [531]:
# Edit values to match columns in regression dataframes
user_df["Age"] = 'Age_' + user_df["Age"].astype(str)
user_df["Height"] = 'Height_' + user_df["Height"].astype(str)
user_df["Weight"] = 'Weight_' + user_df["Weight"].astype(str)

user_df

Unnamed: 0,Age,Height,Weight,NOC
0,Age_2,Height_5,Weight_2,NOC_URS


In [532]:
mBasketball_empty = mBasketball_dummies[0:0]
mBasketball_empty

Unnamed: 0,Basketball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE


In [790]:
mRegression_empty = mBasketball_empty.drop(['Basketball'], axis=1)
mRegression_empty

Unnamed: 0,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,Height_0,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE


In [791]:
# export the regression_empty df for use in script
mRegression_empty.to_csv('./PredVariables/mRegression_empty.csv')

In [534]:
dummy_col = list(regression_empty.columns.values.tolist())
print(dummy_col)

['Age_0', 'Age_1', 'Age_2', 'Age_3', 'Age_4', 'Age_5', 'Age_6', 'Age_7', 'Age_8', 'Height_0', 'Height_1', 'Height_2', 'Height_3', 'Height_4', 'Height_5', 'Height_6', 'Height_7', 'Height_8', 'Weight_0', 'Weight_1', 'Weight_2', 'Weight_3', 'Weight_4', 'Weight_5', 'Weight_6', 'Weight_7', 'Weight_8', 'NOC_ABW', 'NOC_AFG', 'NOC_AGO', 'NOC_ALB', 'NOC_AND', 'NOC_ARE', 'NOC_ARG', 'NOC_ARM', 'NOC_ASM', 'NOC_ATG', 'NOC_AUS', 'NOC_AUT', 'NOC_AZE', 'NOC_BDI', 'NOC_BEL', 'NOC_BEN', 'NOC_BFA', 'NOC_BGD', 'NOC_BGR', 'NOC_BHR', 'NOC_BHS', 'NOC_BIH', 'NOC_BLR', 'NOC_BLZ', 'NOC_BMU', 'NOC_BOL', 'NOC_BRA', 'NOC_BRB', 'NOC_BRN', 'NOC_BTN', 'NOC_BWA', 'NOC_CAF', 'NOC_CAN', 'NOC_CHE', 'NOC_CHL', 'NOC_CHN', 'NOC_CIV', 'NOC_CMR', 'NOC_COD', 'NOC_COG', 'NOC_COL', 'NOC_COM', 'NOC_CPV', 'NOC_CRI', 'NOC_CUB', 'NOC_CYM', 'NOC_CYP', 'NOC_CZE', 'NOC_DEU', 'NOC_DJI', 'NOC_DMA', 'NOC_DNK', 'NOC_DOM', 'NOC_DZA', 'NOC_ECU', 'NOC_EGY', 'NOC_ERI', 'NOC_ESP', 'NOC_EST', 'NOC_ETH', 'NOC_EUN', 'NOC_FIN', 'NOC_FJI', 'NOC_FRA'

In [535]:
user_age_reg = user_df.at[0, 'Age']
user_age_reg

'Age_2'

In [536]:
# Create dummies version of user data
user_dummies_data = []

for col in dummy_col:
    if col == user_df.at[0, 'Age']:
        user_dummies_data.append(1)
    elif col == user_df.at[0, 'Height']:
        user_dummies_data.append(1)
    elif col == user_df.at[0, 'Weight']:
        user_dummies_data.append(1)
    elif col == user_df.at[0, 'NOC']:
        user_dummies_data.append(1)
    else: user_dummies_data.append(0)

print(user_dummies_data)
            
            

[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [537]:
# Create new Dataframe
user_df_reg = regression_empty.append(pd.Series(user_dummies_data, index=dummy_col), ignore_index=True)
user_df_reg

Unnamed: 0,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,Height_0,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [538]:
sports

array(['Basketball', 'Judo', 'Badminton', 'Athletics', 'Weightlifting',
       'Wrestling', 'Rowing', 'Swimming', 'Football', 'Equestrianism',
       'Shooting', 'Gymnastics', 'Taekwondo', 'Boxing', 'Fencing',
       'Diving', 'Canoeing', 'Handball', 'Water Polo', 'Tennis',
       'Cycling', 'Hockey', 'Softball', 'Archery', 'Volleyball',
       'Synchronized Swimming', 'Modern Pentathlon', 'Table Tennis',
       'Baseball', 'Rhythmic Gymnastics', 'Rugby Sevens', 'Trampolining',
       'Beach Volleyball', 'Triathlon', 'Golf'], dtype=object)

In [539]:
msports = []
user_pred_list = []

In [540]:
mBasketball_user_pred = mBasketball_clf.predict_proba(user_df_reg)
mBasketball_user_pred = mBasketball_user_pred[:,1]
print(mBasketball_user_pred)
msports.append('Basketball')
user_pred_list.append(mBasketball_user_pred)

[0.65666563]


In [541]:
mJudo_user_pred = mJudo_clf.predict_proba(user_df_reg)
mJudo_user_pred = mJudo_user_pred[:,1]
print(mJudo_user_pred)
msports.append('Judo')
user_pred_list.append(mJudo_user_pred)

[0.23351568]


In [542]:
mBadminton_user_pred = mBadminton_clf.predict_proba(user_df_reg)
mBadminton_user_pred = mBadminton_user_pred[:,1]
print(mBadminton_user_pred)
msports.append('Badminton')
user_pred_list.append(mBadminton_user_pred)

[0.00138875]


In [543]:
mAthletics_user_pred = mAthletics_clf.predict_proba(user_df_reg)
mAthletics_user_pred = mAthletics_user_pred[:,1]
print(mAthletics_user_pred)
msports.append('Athletics')
user_pred_list.append(mAthletics_user_pred)

[0.50762209]


In [544]:
mWeightlifting_user_pred = mWeightlifting_clf.predict_proba(user_df_reg)
mWeightlifting_user_pred = mWeightlifting_user_pred[:,1]
print(mWeightlifting_user_pred)
msports.append('Weightlifting')
user_pred_list.append(mWeightlifting_user_pred)

[0.02696667]


In [545]:
mWrestling_user_pred = mWrestling_clf.predict_proba(user_df_reg)
mWrestling_user_pred = mWrestling_user_pred[:,1]
print(mWrestling_user_pred)
msports.append('Wrestling')
user_pred_list.append(mWrestling_user_pred)

[0.25713484]


In [546]:
mRowing_user_pred = mRowing_clf.predict_proba(user_df_reg)
mRowing_user_pred = mRowing_user_pred[:,1]
print(mRowing_user_pred)
msports.append('Rowing')
user_pred_list.append(mRowing_user_pred)

[0.80187808]


In [547]:
mSwimming_user_pred = mSwimming_clf.predict_proba(user_df_reg)
mSwimming_user_pred = mSwimming_user_pred[:,1]
print(mSwimming_user_pred)
msports.append('Swimming')
user_pred_list.append(mSwimming_user_pred)

[0.39040947]


In [548]:
mFootball_user_pred = mFootball_clf.predict_proba(user_df_reg)
mFootball_user_pred = mFootball_user_pred[:,1]
print(mFootball_user_pred)
msports.append('Football')
user_pred_list.append(mFootball_user_pred)

[0.29930617]


In [549]:
mEquestrianism_user_pred = mEquestrianism_clf.predict_proba(user_df_reg)
mEquestrianism_user_pred = mEquestrianism_user_pred[:,1]
print(mEquestrianism_user_pred)
msports.append('Equestrianism')
user_pred_list.append(mEquestrianism_user_pred)

[0.3628503]


In [550]:
mShooting_user_pred = mShooting_clf.predict_proba(user_df_reg)
mShooting_user_pred = mShooting_user_pred[:,1]
print(mShooting_user_pred)
msports.append('Shooting')
user_pred_list.append(mShooting_user_pred)

[0.33726037]


In [551]:
mGymnastics_user_pred = mGymnastics_clf.predict_proba(user_df_reg)
mGymnastics_user_pred = mGymnastics_user_pred[:,1]
print(mGymnastics_user_pred)
msports.append('Gymnastics')
user_pred_list.append(mGymnastics_user_pred)

[0.00561291]


In [552]:
mTaekwondo_user_pred = mTaekwondo_clf.predict_proba(user_df_reg)
mTaekwondo_user_pred = mTaekwondo_user_pred[:,1]
print(mTaekwondo_user_pred)
msports.append('Taekwondo')
user_pred_list.append(mTaekwondo_user_pred)

[0.00133288]


In [553]:
mBoxing_user_pred = mBoxing_clf.predict_proba(user_df_reg)
mBoxing_user_pred = mBoxing_user_pred[:,1]
print(mBoxing_user_pred)
msports.append('Boxing')
user_pred_list.append(mBoxing_user_pred)

[0.17803538]


In [554]:
mFencing_user_pred = mFencing_clf.predict_proba(user_df_reg)
mFencing_user_pred = mFencing_user_pred[:,1]
print(mFencing_user_pred)
msports.append('Fencing')
user_pred_list.append(mFencing_user_pred)

[0.71248809]


In [555]:
mDiving_user_pred = mDiving_clf.predict_proba(user_df_reg)
mDiving_user_pred = mDiving_user_pred[:,1]
print(mDiving_user_pred)
msports.append('Diving')
user_pred_list.append(mDiving_user_pred)

[0.07611969]


In [556]:
mCanoeing_user_pred = mCanoeing_clf.predict_proba(user_df_reg)
mCanoeing_user_pred = mCanoeing_user_pred[:,1]
print(mCanoeing_user_pred)
msports.append('Canoeing')
user_pred_list.append(mCanoeing_user_pred)

[0.57044313]


In [557]:
mHandball_user_pred = mHandball_clf.predict_proba(user_df_reg)
mHandball_user_pred = mHandball_user_pred[:,1]
print(mHandball_user_pred)
msports.append('Handball')
user_pred_list.append(mHandball_user_pred)

[0.75445894]


In [558]:
mWater_Polo_user_pred = mWater_Polo_clf.predict_proba(user_df_reg)
mWater_Polo_user_pred = mWater_Polo_user_pred[:,1]
print(mWater_Polo_user_pred)
msports.append('Water Polo')
user_pred_list.append(mWater_Polo_user_pred)

[0.73806677]


In [559]:
mTennis_user_pred = mTennis_clf.predict_proba(user_df_reg)
mTennis_user_pred = mTennis_user_pred[:,1]
print(mTennis_user_pred)
msports.append('Tennis')
user_pred_list.append(mTennis_user_pred)

[0.38599524]


In [560]:
mCycling_user_pred = mCycling_clf.predict_proba(user_df_reg)
mCycling_user_pred = mCycling_user_pred[:,1]
print(mCycling_user_pred)
msports.append('Cycling')
user_pred_list.append(mCycling_user_pred)

[0.31262905]


In [561]:
mHockey_user_pred = mHockey_clf.predict_proba(user_df_reg)
mHockey_user_pred = mHockey_user_pred[:,1]
print(mHockey_user_pred)
msports.append('Hockey')
user_pred_list.append(mHockey_user_pred)

[0.26702578]


In [562]:
mArchery_user_pred = mArchery_clf.predict_proba(user_df_reg)
mArchery_user_pred = mArchery_user_pred[:,1]
print(mArchery_user_pred)
msports.append('Archery')
user_pred_list.append(mArchery_user_pred)

[0.25547963]


In [563]:
mVolleyball_user_pred = mVolleyball_clf.predict_proba(user_df_reg)
mVolleyball_user_pred = mVolleyball_user_pred[:,1]
print(mVolleyball_user_pred)
msports.append('Volleyball')
user_pred_list.append(mVolleyball_user_pred)

[0.7937518]


In [564]:
mModern_Pentathlon_user_pred = mModern_Pentathlon_clf.predict_proba(user_df_reg)
mModern_Pentathlon_user_pred = mModern_Pentathlon_user_pred[:,1]
print(mModern_Pentathlon_user_pred)
msports.append('Modern Pentathlon')
user_pred_list.append(mModern_Pentathlon_user_pred)

[0.59621373]


In [565]:
mTable_Tennis_user_pred = mTable_Tennis_clf.predict_proba(user_df_reg)
mTable_Tennis_user_pred = mTable_Tennis_user_pred[:,1]
print(mTable_Tennis_user_pred)
msports.append('Table Tennis')
user_pred_list.append(mTable_Tennis_user_pred)

[0.08427226]


In [566]:
mBaseball_user_pred = mBaseball_clf.predict_proba(user_df_reg)
mBaseball_user_pred = mBaseball_user_pred[:,1]
print(mBaseball_user_pred)
msports.append('Baseball')
user_pred_list.append(mBaseball_user_pred)

[0.00094955]


In [567]:
mRugby_Sevens_user_pred = mRugby_Sevens_clf.predict_proba(user_df_reg)
mRugby_Sevens_user_pred = mRugby_Sevens_user_pred[:,1]
print(mRugby_Sevens_user_pred)
msports.append('Rugby Sevens')
user_pred_list.append(mRugby_Sevens_user_pred)

[0.00043854]


In [568]:
mTrampolining_user_pred = mTrampolining_clf.predict_proba(user_df_reg)
mTrampolining_user_pred = mTrampolining_user_pred[:,1]
print(mTrampolining_user_pred)
msports.append('Trampolining')
user_pred_list.append(mTrampolining_user_pred)

[1.38238504e-05]


In [569]:
mBeach_Volleyball_user_pred = mBeach_Volleyball_clf.predict_proba(user_df_reg)
mBeach_Volleyball_user_pred = mBeach_Volleyball_user_pred[:,1]
print(mBeach_Volleyball_user_pred)
msports.append('Beach Volleyball')
user_pred_list.append(mBeach_Volleyball_user_pred)

[0.00343116]


In [570]:
mTriathlon_user_pred = mTriathlon_clf.predict_proba(user_df_reg)
mTriathlon_user_pred = mTriathlon_user_pred[:,1]
print(mTriathlon_user_pred)
msports.append('Triathlon')
user_pred_list.append(mTriathlon_user_pred)

[0.00035686]


In [571]:
# mGolf_user_pred = mGolf_clf.predict_proba(user_df_reg)
# mGolf_user_pred = mGolf_user_pred[:,1]
# print(mGolf_user_pred)
# msports.append('Golf')
# user_pred_list.append(mGolf_user_pred)

In [572]:
user_max = max(user_pred_list)
user_max_pos = user_pred_list.index(user_max)
user_max_sport = msports[user_max_pos]
print(user_max_sport)

Rowing


In [573]:
user_min = min(user_pred_list)
user_min_pos = user_pred_list.index(user_min)
user_min_sport = msports[user_min_pos]
print(user_min_sport)

Trampolining


### User Women's Sports Predictor

In [379]:
# bin user info
user_df["Age"] = pd.cut(user_df["Age"], fAge_bins, labels=fAge_bin_names)
user_df["Height"] = pd.cut(user_df["Height"], fHeight_bins, labels=fHeight_bin_names)
user_df["Weight"] = pd.cut(user_df["Weight"], fWeight_bins, labels=fWeight_bin_names)
user_df["NOC"] = 'NOC_' + user_df["NOC"].astype(str)

user_df

Unnamed: 0,Age,Height,Weight,NOC
0,2,5,3,NOC_USA


In [380]:
# Edit values to match columns in regression dataframes
user_df["Age"] = 'Age_' + user_df["Age"].astype(str)
user_df["Height"] = 'Height_' + user_df["Height"].astype(str)
user_df["Weight"] = 'Weight_' + user_df["Weight"].astype(str)

user_df

Unnamed: 0,Age,Height,Weight,NOC
0,Age_2,Height_5,Weight_3,NOC_USA


In [788]:
fBasketball_empty = fBasketball_dummies[0:0]
fBasketball_empty

Unnamed: 0,Basketball,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE


In [792]:
fRegression_empty = fBasketball_empty.drop(['Basketball'], axis=1)
fRegression_empty

Unnamed: 0,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,Height_0,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE


In [793]:
fRegression_empty.to_csv('./PredVariables/fRegression_empty.csv')

In [383]:
dummy_col = list(regression_empty.columns.values.tolist())
print(dummy_col)

['Age_0', 'Age_1', 'Age_2', 'Age_3', 'Age_4', 'Age_5', 'Age_6', 'Age_7', 'Age_8', 'Height_0', 'Height_1', 'Height_2', 'Height_3', 'Height_4', 'Height_5', 'Height_6', 'Height_7', 'Height_8', 'Weight_0', 'Weight_1', 'Weight_2', 'Weight_3', 'Weight_4', 'Weight_5', 'Weight_6', 'Weight_7', 'Weight_8', 'NOC_ABW', 'NOC_AFG', 'NOC_AGO', 'NOC_ALB', 'NOC_AND', 'NOC_ARE', 'NOC_ARG', 'NOC_ARM', 'NOC_ASM', 'NOC_ATG', 'NOC_AUS', 'NOC_AUT', 'NOC_AZE', 'NOC_BDI', 'NOC_BEL', 'NOC_BEN', 'NOC_BFA', 'NOC_BGD', 'NOC_BGR', 'NOC_BHR', 'NOC_BHS', 'NOC_BIH', 'NOC_BLR', 'NOC_BLZ', 'NOC_BMU', 'NOC_BOL', 'NOC_BRA', 'NOC_BRB', 'NOC_BRN', 'NOC_BTN', 'NOC_BWA', 'NOC_CAF', 'NOC_CAN', 'NOC_CHE', 'NOC_CHL', 'NOC_CHN', 'NOC_CIV', 'NOC_CMR', 'NOC_COD', 'NOC_COG', 'NOC_COL', 'NOC_COM', 'NOC_CPV', 'NOC_CRI', 'NOC_CUB', 'NOC_CYM', 'NOC_CYP', 'NOC_CZE', 'NOC_DEU', 'NOC_DJI', 'NOC_DMA', 'NOC_DNK', 'NOC_DOM', 'NOC_DZA', 'NOC_ECU', 'NOC_EGY', 'NOC_ERI', 'NOC_ESP', 'NOC_EST', 'NOC_ETH', 'NOC_EUN', 'NOC_FIN', 'NOC_FJI', 'NOC_FRA'

In [384]:
user_age_reg = user_df.at[0, 'Age']
user_age_reg

'Age_2'

In [385]:
# Create dummies version of user data
user_dummies_data = []

for col in dummy_col:
    if col == user_df.at[0, 'Age']:
        user_dummies_data.append(1)
    elif col == user_df.at[0, 'Height']:
        user_dummies_data.append(1)
    elif col == user_df.at[0, 'Weight']:
        user_dummies_data.append(1)
    elif col == user_df.at[0, 'NOC']:
        user_dummies_data.append(1)
    else: user_dummies_data.append(0)

print(user_dummies_data)
            
            

[0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]


In [386]:
# Create new Dataframe
user_df_reg = regression_empty.append(pd.Series(user_dummies_data, index=dummy_col), ignore_index=True)
user_df_reg

Unnamed: 0,Age_0,Age_1,Age_2,Age_3,Age_4,Age_5,Age_6,Age_7,Age_8,Height_0,...,NOC_VGB,NOC_VIR,NOC_VNM,NOC_VUT,NOC_WSM,NOC_XKX,NOC_YEM,NOC_ZAF,NOC_ZMB,NOC_ZWE
0,0,0,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [387]:
sports

array(['Basketball', 'Judo', 'Badminton', 'Athletics', 'Weightlifting',
       'Wrestling', 'Rowing', 'Swimming', 'Football', 'Equestrianism',
       'Shooting', 'Gymnastics', 'Taekwondo', 'Boxing', 'Fencing',
       'Diving', 'Canoeing', 'Handball', 'Water Polo', 'Tennis',
       'Cycling', 'Hockey', 'Softball', 'Archery', 'Volleyball',
       'Synchronized Swimming', 'Modern Pentathlon', 'Table Tennis',
       'Baseball', 'Rhythmic Gymnastics', 'Rugby Sevens', 'Trampolining',
       'Beach Volleyball', 'Triathlon', 'Golf'], dtype=object)

In [391]:
fsports = []
user_pred_list = []

In [392]:
fBasketball_user_pred = fBasketball_clf.predict_proba(user_df_reg)
fBasketball_user_pred = fBasketball_user_pred[:,1]
print(fBasketball_user_pred)
fsports.append('Basketball')
user_pred_list.append(fBasketball_user_pred)

[0.48771829]


In [393]:
fJudo_user_pred = fJudo_clf.predict_proba(user_df_reg)
fJudo_user_pred = fJudo_user_pred[:,1]
print(fJudo_user_pred)
fsports.append('Judo')
user_pred_list.append(fJudo_user_pred)

[0.74458272]


In [394]:
fBadminton_user_pred = fBadminton_clf.predict_proba(user_df_reg)
fBadminton_user_pred = fBadminton_user_pred[:,1]
print(fBadminton_user_pred)
fsports.append('Badminton')
user_pred_list.append(fBadminton_user_pred)

[0.05088235]


In [395]:
fAthletics_user_pred = fAthletics_clf.predict_proba(user_df_reg)
fAthletics_user_pred = fAthletics_user_pred[:,1]
print(fAthletics_user_pred)
fsports.append('Athletics')
user_pred_list.append(fAthletics_user_pred)

[0.68794466]


In [396]:
fWeightlifting_user_pred = fWeightlifting_clf.predict_proba(user_df_reg)
fWeightlifting_user_pred = fWeightlifting_user_pred[:,1]
print(fWeightlifting_user_pred)
fsports.append('Weightlifting')
user_pred_list.append(fWeightlifting_user_pred)

[0.48367422]


In [397]:
fWrestling_user_pred = fWrestling_clf.predict_proba(user_df_reg)
fWrestling_user_pred = fWrestling_user_pred[:,1]
print(fWrestling_user_pred)
fsports.append('Wrestling')
user_pred_list.append(fWrestling_user_pred)

[0.76531922]


In [398]:
fRowing_user_pred = fRowing_clf.predict_proba(user_df_reg)
fRowing_user_pred = fRowing_user_pred[:,1]
print(mRowing_user_pred)
msports.append('Rowing')
user_pred_list.append(mRowing_user_pred)

[0.66252353]


In [399]:
fSwimming_user_pred = fSwimming_clf.predict_proba(user_df_reg)
fSwimming_user_pred = fSwimming_user_pred[:,1]
print(fSwimming_user_pred)
fsports.append('Swimming')
user_pred_list.append(fSwimming_user_pred)

[0.10197748]


In [400]:
mFootball_user_pred = mFootball_clf.predict_proba(user_df_reg)
mFootball_user_pred = mFootball_user_pred[:,1]
print(fFootball_user_pred)
fsports.append('Football')
user_pred_list.append(fFootball_user_pred)

[0.01318691]


In [401]:
mEquestrianism_user_pred = mEquestrianism_clf.predict_proba(user_df_reg)
mEquestrianism_user_pred = mEquestrianism_user_pred[:,1]
print(fEquestrianism_user_pred)
fsports.append('Equestrianism')
user_pred_list.append(fEquestrianism_user_pred)

[0.00885191]


In [402]:
mShooting_user_pred = mShooting_clf.predict_proba(user_df_reg)
mShooting_user_pred = mShooting_user_pred[:,1]
print(fShooting_user_pred)
fsports.append('Shooting')
user_pred_list.append(fShooting_user_pred)

[0.40742372]


In [403]:
mGymnastics_user_pred = mGymnastics_clf.predict_proba(user_df_reg)
mGymnastics_user_pred = mGymnastics_user_pred[:,1]
print(fGymnastics_user_pred)
fsports.append('Gymnastics')
user_pred_list.append(fGymnastics_user_pred)

[0.00014838]


In [404]:
mTaekwondo_user_pred = mTaekwondo_clf.predict_proba(user_df_reg)
mTaekwondo_user_pred = mTaekwondo_user_pred[:,1]
print(fTaekwondo_user_pred)
fsports.append('Taekwondo')
user_pred_list.append(fTaekwondo_user_pred)

[0.45826241]


In [405]:
mBoxing_user_pred = mBoxing_clf.predict_proba(user_df_reg)
mBoxing_user_pred = mBoxing_user_pred[:,1]
print(fBoxing_user_pred)
fsports.append('Boxing')
user_pred_list.append(fBoxing_user_pred)

[0.22740991]


In [406]:
mFencing_user_pred = mFencing_clf.predict_proba(user_df_reg)
mFencing_user_pred = mFencing_user_pred[:,1]
print(fFencing_user_pred)
fsports.append('Fencing')
user_pred_list.append(fFencing_user_pred)

[0.2325719]


In [407]:
mDiving_user_pred = mDiving_clf.predict_proba(user_df_reg)
mDiving_user_pred = mDiving_user_pred[:,1]
print(fDiving_user_pred)
fsports.append('Diving')
user_pred_list.append(fDiving_user_pred)

[0.0006849]


In [408]:
mCanoeing_user_pred = mCanoeing_clf.predict_proba(user_df_reg)
mCanoeing_user_pred = mCanoeing_user_pred[:,1]
print(fCanoeing_user_pred)
fsports.append('Canoeing')
user_pred_list.append(fCanoeing_user_pred)

[0.22081763]


In [409]:
mHandball_user_pred = mHandball_clf.predict_proba(user_df_reg)
mHandball_user_pred = mHandball_user_pred[:,1]
print(fHandball_user_pred)
fsports.append('Handball')
user_pred_list.append(fHandball_user_pred)

[0.70452191]


In [410]:
mWater_Polo_user_pred = mWater_Polo_clf.predict_proba(user_df_reg)
mWater_Polo_user_pred = mWater_Polo_user_pred[:,1]
print(fWater_Polo_user_pred)
fsports.append('Water Polo')
user_pred_list.append(fWater_Polo_user_pred)

[0.82287189]


In [411]:
mTennis_user_pred = mTennis_clf.predict_proba(user_df_reg)
mTennis_user_pred = mTennis_user_pred[:,1]
print(fTennis_user_pred)
fsports.append('Tennis')
user_pred_list.append(fTennis_user_pred)

[0.1738451]


In [412]:
mCycling_user_pred = mCycling_clf.predict_proba(user_df_reg)
mCycling_user_pred = mCycling_user_pred[:,1]
print(fCycling_user_pred)
fsports.append('Cycling')
user_pred_list.append(fCycling_user_pred)

[0.07065454]


In [413]:
mHockey_user_pred = mHockey_clf.predict_proba(user_df_reg)
mHockey_user_pred = mHockey_user_pred[:,1]
print(fHockey_user_pred)
fsports.append('Hockey')
user_pred_list.append(fHockey_user_pred)

[0.03713397]


In [414]:
mArchery_user_pred = mArchery_clf.predict_proba(user_df_reg)
mArchery_user_pred = mArchery_user_pred[:,1]
print(fArchery_user_pred)
fsports.append('Archery')
user_pred_list.append(fArchery_user_pred)

[0.53799866]


In [415]:
mVolleyball_user_pred = mVolleyball_clf.predict_proba(user_df_reg)
mVolleyball_user_pred = mVolleyball_user_pred[:,1]
print(fVolleyball_user_pred)
fsports.append('Volleyball')
user_pred_list.append(fVolleyball_user_pred)

[0.38625746]


In [416]:
mModern_Pentathlon_user_pred = mModern_Pentathlon_clf.predict_proba(user_df_reg)
mModern_Pentathlon_user_pred = mModern_Pentathlon_user_pred[:,1]
print(fModern_Pentathlon_user_pred)
fsports.append('Modern Pentathlon')
user_pred_list.append(fModern_Pentathlon_user_pred)

[0.00081643]


In [417]:
mTable_Tennis_user_pred = mTable_Tennis_clf.predict_proba(user_df_reg)
mTable_Tennis_user_pred = mTable_Tennis_user_pred[:,1]
print(fTable_Tennis_user_pred)
fsports.append('Table Tennis')
user_pred_list.append(fTable_Tennis_user_pred)

[0.07106406]


In [418]:
mBaseball_user_pred = mBaseball_clf.predict_proba(user_df_reg)
mBaseball_user_pred = mBaseball_user_pred[:,1]
print(fBaseball_user_pred)
fsports.append('Baseball')
user_pred_list.append(fBaseball_user_pred)

[0.91911931]


In [419]:
mRugby_Sevens_user_pred = mRugby_Sevens_clf.predict_proba(user_df_reg)
mRugby_Sevens_user_pred = mRugby_Sevens_user_pred[:,1]
print(fRugby_Sevens_user_pred)
fsports.append('Rugby Sevens')
user_pred_list.append(fRugby_Sevens_user_pred)

[0.97152557]


In [420]:
mTrampolining_user_pred = mTrampolining_clf.predict_proba(user_df_reg)
mTrampolining_user_pred = mTrampolining_user_pred[:,1]
print(fTrampolining_user_pred)
fsports.append('Trampolining')
user_pred_list.append(fTrampolining_user_pred)

[7.42227548e-05]


In [574]:
mBeach_Volleyball_user_pred = mBeach_Volleyball_clf.predict_proba(user_df_reg)
mBeach_Volleyball_user_pred = mBeach_Volleyball_user_pred[:,1]
print(fBeach_Volleyball_user_pred)
fsports.append('Beach Volleyball')
user_pred_list.append(fBeach_Volleyball_user_pred)

NameError: name 'fBeach_Volleyball_user_pred' is not defined

In [422]:
mTriathlon_user_pred = mTriathlon_clf.predict_proba(user_df_reg)
mTriathlon_user_pred = mTriathlon_user_pred[:,1]
print(fTriathlon_user_pred)
fsports.append('Triathlon')
user_pred_list.append(fTriathlon_user_pred)

[0.00093993]


In [423]:
# mGolf_user_pred = mGolf_clf.predict_proba(user_df_reg)
# mGolf_user_pred = mGolf_user_pred[:,1]
# print(fGolf_user_pred)
# fsports.append('Golf')
# user_pred_list.append(fGolf_user_pred)

In [427]:
user_max = max(user_pred_list)
user_max_pos = user_pred_list.index(user_max)
user_max_sport = fsports[user_max_pos]
print(user_max_sport)

Rugby Sevens


In [428]:
user_min = min(user_pred_list)
user_min_pos = user_pred_list.index(user_min)
user_min_sport = fsports[user_min_pos]
print(user_min_sport)

Trampolining
