# Sport Prediction with a Random Forest (scikit-learn, not TensorFlow)

## Dataset

In [1]:
import pandas as pd
import numpy as np
import joblib

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

from sklearn.preprocessing import LabelEncoder

# Read the dataset CSV (path is relative to the notebook's working directory)
# and show the resulting frame as the cell output.
dataset_path = '../data-files/dataset.csv'
dataset = pd.read_csv(dataset_path)
dataset

Unnamed: 0,sport,event,pos,medal,weight,height,sex,year,age,bmi,podium
0,Athletics,"3,500 metres Race Walk, Men",3.0,Bronze,76.0,184.0,Male,1908,29.0,22.448015,1
1,Swimming,"400 metres Freestyle, Men",2.0,Silver,65.0,170.0,Male,1908,17.0,22.491349,1
2,Swimming,"1,500 metres Freestyle, Men",3.0,Bronze,65.0,170.0,Male,1908,17.0,22.491349,1
3,Tennis,"Singles, Covered Courts, Men",3.0,Bronze,84.0,188.0,Male,1912,29.0,23.766410,1
4,Artistic Swimming,"Solo, Women",18.0,na,58.0,165.0,Female,1988,19.0,21.303949,0
...,...,...,...,...,...,...,...,...,...,...,...
80901,Bobsleigh,"Monobob, Women",12.0,na,74.0,180.0,Female,2022,28.0,22.839506,0
80902,Bobsleigh,"Monobob, Women",13.0,na,75.0,169.0,Female,2022,32.0,26.259585,0
80903,Bobsleigh,"Monobob, Women",14.0,na,73.0,171.0,Female,2022,29.0,24.964946,0
80904,Bobsleigh,"Monobob, Women",17.0,na,76.0,175.0,Female,2022,20.0,24.816327,0


## Encoding

In [2]:
# Replace the categorical 'sex' column with integer codes.
# The fitted encoder is kept in `sex_encoder` so the code -> label mapping
# (sex_encoder.classes_) remains available after the column is overwritten.
sex_encoder = LabelEncoder().fit(dataset['sex'])
dataset['sex'] = sex_encoder.transform(dataset['sex'])

dataset

Unnamed: 0,sport,event,pos,medal,weight,height,sex,year,age,bmi,podium
0,Athletics,"3,500 metres Race Walk, Men",3.0,Bronze,76.0,184.0,1,1908,29.0,22.448015,1
1,Swimming,"400 metres Freestyle, Men",2.0,Silver,65.0,170.0,1,1908,17.0,22.491349,1
2,Swimming,"1,500 metres Freestyle, Men",3.0,Bronze,65.0,170.0,1,1908,17.0,22.491349,1
3,Tennis,"Singles, Covered Courts, Men",3.0,Bronze,84.0,188.0,1,1912,29.0,23.766410,1
4,Artistic Swimming,"Solo, Women",18.0,na,58.0,165.0,0,1988,19.0,21.303949,0
...,...,...,...,...,...,...,...,...,...,...,...
80901,Bobsleigh,"Monobob, Women",12.0,na,74.0,180.0,0,2022,28.0,22.839506,0
80902,Bobsleigh,"Monobob, Women",13.0,na,75.0,169.0,0,2022,32.0,26.259585,0
80903,Bobsleigh,"Monobob, Women",14.0,na,73.0,171.0,0,2022,29.0,24.964946,0
80904,Bobsleigh,"Monobob, Women",17.0,na,76.0,175.0,0,2022,20.0,24.816327,0


## Random Forest Classification Model

In [3]:
# Split the data into a training set (random 80% sample, fixed seed) and a
# test set made of every row whose index label was not sampled.
train_dataset = dataset.sample(frac=0.8, random_state=0)
held_out_mask = ~dataset.index.isin(train_dataset.index)
test_dataset = dataset[held_out_mask]

In [4]:
feature_columns = ['height', 'weight', 'age', 'sex', 'bmi']
target_column = 'sport'

# Build the feature frames for both splits with the same encoding call.
# get_dummies one-hot encodes any non-numeric feature column (dropping the
# first level) and passes numeric columns through.
train_features, test_features = (
    pd.get_dummies(split[feature_columns], drop_first=True)
    for split in (train_dataset, test_dataset)
)

# Force the test frame onto the training frame's columns: columns missing
# from the test split are added and filled with 0, extra ones are dropped.
train_features, test_features = train_features.align(
    test_features,
    join='left',
    axis=1,
    fill_value=0,
)

# Target values (the label to predict) for each split
train_labels, test_labels = train_dataset[target_column], test_dataset[target_column]

# Hand sklearn plain numpy arrays: X_* hold the features, y_* the labels.
X_train, y_train = train_features.to_numpy(), train_labels.to_numpy()
X_test, y_test = test_features.to_numpy(), test_labels.to_numpy()

# Train the random forest on the training split.
# random_state is pinned (same seed as the train/test sampling) so that a
# fresh "Restart & Run All" rebuilds the same model instead of a different
# forest on every run.
clf = RandomForestClassifier(n_estimators=100, random_state=0)
clf.fit(X_train, y_train)

# Evaluate on the held-out split: fraction of test rows whose predicted
# label equals the true label.
predictions = clf.predict(X_test)
test_accuracy = np.mean(predictions == y_test)
print(f"Test accuracy: {test_accuracy:.3f}")

# Persist the fitted classifier for the backend.
# NOTE(review): only `clf` is saved. Whatever loads this file must feed the
# features in the order of `feature_columns` and encode 'sex' the same way
# `sex_encoder` did -- confirm the backend does, or persist the encoder too.
joblib.dump(clf, '../SportPredictionBackend/random_forest_model.pkl')

['../SportPredictionBackend/random_forest_model.pkl']