In [113]:
# Import Library
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error

In [129]:
# import data
df = pd.read_csv('data.csv')
df.head()

Unnamed: 0,years,leangth,age
0,1994,2h 22m,12
1,1972,2h 55m,16
2,2008,2h 32m,16
3,1974,3h 22m,16
4,1957,1h 36m,12


In [130]:
# view unique values
df["age"].unique()

array(['12', '16', '18', '2', '6', nan, '0', '(Ba', 'Ban', 'Not', 'R',
       'Inf'], dtype=object)

In [131]:
# view data with at least 1 NaN values
df[pd.isnull(df).any(axis=1)]

Unnamed: 0,years,leangth,age
30,2001,2h 5m0,
51,2023,2h 27m,
72,2003,2h16,
82,2023,3h12,
108,2001,2h 2m6,
131,2013,3h16,
132,1921,54m0,
198,2015,2h16,
200,1924,45m6,
207,1939,2h 9m0,


In [135]:
# remove rows with NaN values and reset indexes in case any rows were dropped
df = df.dropna()
df = df.reset_index(drop=True)
df["rank"] = df["index"]
df.head()

Unnamed: 0,level_0,index,years,leangth,age,rank
0,0,0,1994,2h 22m,12,0
1,1,1,1972,2h 55m,16,1
2,2,2,2008,2h 32m,16,2
3,3,3,1974,3h 22m,16,3
4,4,4,1957,1h 36m,12,4


In [None]:
df = df.drop(columns=["index"])
df = df.drop(columns=["level_0"])

In [140]:
print(df.head(20))

    years leangth age  rank
0    1994  2h 22m  12     0
1    1972  2h 55m  16     1
2    2008  2h 32m  16     2
3    1974  3h 22m  16     3
4    1957  1h 36m  12     4
5    1993  3h 15m  12     5
6    2003  3h 21m  12     6
7    1994  2h 34m  16     7
8    2001  2h 58m  12     8
9    1966  2h 41m  18     9
10   1994  2h 22m  12    10
11   2002  2h 59m  12    11
12   1999  2h 19m  18    12
13   2010  2h 28m  12    13
14   1980  2h 4m1   2    14
15   1999  2h 16m  16    15
16   1990  2h 25m  16    16
17   1975  2h 13m  16    17
18   1995  2h 7m1   6    18
19   1946  2h 10m   6    19


In [141]:
# reorganize columns
df = df[["rank", "years", "leangth", "age"]]
print(df.head())

   rank  years leangth age
0     0   1994  2h 22m  12
1     1   1972  2h 55m  16
2     2   2008  2h 32m  16
3     3   1974  3h 22m  16
4     4   1957  1h 36m  12


In [156]:
df["length"] = df["leangth"]
df.drop(columns=["leangth"], inplace=True)
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["length"] = df["leangth"]


Unnamed: 0,rank,years,age,length
0,0,1994,12,2h 22m
1,1,1972,16,2h 55m
2,2,2008,16,2h 32m
3,3,1974,16,3h 22m
4,4,1957,12,1h 36m


In [161]:
df["length"].unique()

array(['2h 22m', '2h 55m', '2h 32m', '3h 22m', '1h 36m', '3h 15m',
       '3h 21m', '2h 34m', '2h 58m', '2h 41m', '2h 59m', '2h 19m',
       '2h 28m', '2h 4m1', '2h 16m', '2h 25m', '2h 13m', '2h 7m1',
       '2h 10m', '2h 49m', '3h 27m', '1h 58m', '1h 56m', '3h 9m1',
       '2h 17m', '2h 1m1', '2h 20m', '2h 30m', '2h 12m', '1h 49m',
       '2h 35m', '1h 28m', '1h 50m', '2h 31m', '1h 59m', '1h 46m',
       '1h 29m', '1h 42m', '1h 52m', '1h 27m', '2h 54m', '2h 46m',
       '1h 57m', '2h 27m', '2h 45m', '1h 53m', '1h 38m', '1h 55m',
       '2h 29m', '2h 26m', '2h 5m(', '2h 33m', '2h 44m', '1h 35m',
       '2h 2m1', '1h 45m', '2h 40m', '1h 21m', '3h 1m1', '2h 14m',
       '2h 6m1', '3h 49m', '2h 23m', '2h 50m', '1h 43m', '2h 11m',
       '1h 48m', '2h 21m', '1h 39m', '3h 38m', '2h 5m1', '2h 8m1',
       '1h 47m', '2h 9m1', '2h 3m1', '2h 42m', '1h 44m', '2h 18m',
       '1h 54m', '2h 36m', '1h 37m', '2h 38m', '2h 15m', '1h 51m',
       '1h 31m', '2h 52m', '1h 40m', '3h 58m', '1h 33m', '1h 2

In [172]:
# Define transformations for different columns
transformers = [
    (
        "age_years",
        SimpleImputer(strategy="most_frequent"),  # Change to most_frequent strategy
        ["age", "years"],
    ),
    ("length", FunctionTransformer(extract_duration_info), ["length"]),
]

# Create ColumnTransformer
preprocessor = ColumnTransformer(transformers, remainder="passthrough")

# Define pipeline for the model
pipeline = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
        ("regressor", RandomForestRegressor()),
    ]
)

# Split data into X and y for rank prediction
X = df.drop("rank", axis=1)  # Features for rank prediction
y = df["rank"]  # Target variable for rank prediction

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Fit pipeline on training data for rank prediction
pipeline.fit(X_train, y_train)

In [173]:
# view the df
print(df.head())
# export dataframe to csv
df.to_csv('data_vs2.csv', index=False)

   rank  years age  length
0     0   1994  12  2h 22m
1     1   1972  16  2h 55m
2     2   2008  16  2h 32m
3     3   1974  16  3h 22m
4     4   1957  12  1h 36m


In [174]:
# Define preds
y_preds = pipeline.predict(X_test)

In [175]:
# view mean squared error - ours really really bad - 0 is the best
mse = mean_squared_error(y_test, y_preds)
print(mse)

6015.033592196961


In [176]:
# Calculate R^2 score on the training set
train_score = pipeline.score(X_train, y_train)
print("Training R^2 Score:", train_score)

# Calculate R^2 score on the test set
test_score = pipeline.score(X_test, y_test)
print("Testing R^2 Score:", test_score)

Training R^2 Score: 0.8252059575132096
Testing R^2 Score: -0.21572380148610093


In [177]:
# Improve
np.random.seed(42)
for i in range(10, 100, 10):
    print(f"Trying model with {i} estimators...")
    print(f"Model accuracy on test set: {pipeline.score(X_train, y_train) * 100:2f}%")
    print("")

Trying model with 10 estimators...
Model accuracy on test set: 82.520596%

Trying model with 20 estimators...
Model accuracy on test set: 82.520596%

Trying model with 30 estimators...
Model accuracy on test set: 82.520596%

Trying model with 40 estimators...
Model accuracy on test set: 82.520596%

Trying model with 50 estimators...
Model accuracy on test set: 82.520596%

Trying model with 60 estimators...
Model accuracy on test set: 82.520596%

Trying model with 70 estimators...
Model accuracy on test set: 82.520596%

Trying model with 80 estimators...
Model accuracy on test set: 82.520596%

Trying model with 90 estimators...
Model accuracy on test set: 82.520596%



In [None]:
# export the model
import pickle

filename = 'final_model.sav'

with open(filename, 'wb') as file:
    pickle.dump(pipeline, file)