In [39]:
import pandas as pd
import numpy as np
from datetime import datetime
import xgboost as xgb

## Data loading

In [40]:
data_path = ''
df = pd.read_csv(data_path)

FileNotFoundError: [Errno 2] No such file or directory: ''

## Data preprocessing

In [None]:
df_year = df['startYear'].replace(0,df['startYear'].median())
df['age'] = datetime.now().year - df_year
df['age'] = df['age'].astype(float)
df = df.drop('startYear', axis=1)

In [None]:
cat_cols = ['titleType']
excluded_cols = ['production_companies','production_countries','tagline','original_language','runtime','endYear','budget','revenue','status','video',
                 'genres_y','adult','isOriginalTitle','language']
preprocessed_cols = ['directors','writers','startYear']
num_cols = ['averageRating','numVotes','runtimeMinutes','isAdult','episodeNumber','seasonNumber','popularity','ordering','attributes']

In [None]:
df = df.drop(excluded_cols, axis=1)

In [None]:
# removing errors in data
df['runtimeMinutes'] = df['runtimeMinutes'].apply(lambda x: 1 if x < 1 else x)
df['isAdult'] = df['isAdult'].apply(lambda x: 0 if x > 1.1 else x) # el .1 es por si llega a haber un floating point error


df['episodeNumber'] = df['episodeNumber'].fillna(df['episodeNumber'].median())
df['episodeNumber'] = df['episodeNumber'].astype(int)
df['episodeNumber'] = df['episodeNumber'].apply(lambda x: 1 if x < 1 else x)
df['episodeNumber'] = np.log10(df['episodeNumber'])

df['seasonNumber'] = df['seasonNumber'].fillna(df['seasonNumber'].median())




df['hasOrdering'] = df['ordering'].notna().astype(int)
df['hasPopularity'] = df['popularity'].notna().astype(int)


df['popularity'] = df['popularity'].fillna(df['popularity'].median())
df['ordering'] = df['ordering'].fillna(df['ordering'].median())

In [None]:
def has_attributes(row):
    if pd.isna(row['attributes']) or row['attributes'] == '0':
        return 0
    else:
        return 1

# Apply the function to create the new column
df['hasAttributes'] = df.apply(has_attributes, axis=1)

In [None]:
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

  df[num_cols] = df[num_cols].fillna(df[num_cols].median())


In [None]:
# reduce categorias, y por ende features

df['titleType'] = df['titleType'].replace('tvEpisode', 'tv')
df['titleType'] = df['titleType'].replace('tvMiniSeries', 'tv')
df['titleType'] = df['titleType'].replace('tvSeries', 'tv')
df['titleType'] = df['titleType'].replace('tvSpecial', 'tv')


df['titleType'] = df['titleType'].replace('tvShort', 'short')
df['titleType'] = df['titleType'].replace('tvMovie', 'movie')

In [None]:
genres_split = df['genres_x'].str.get_dummies(',')
df = df.drop('genres_x', axis=1)

df_cat = df[cat_cols]
df_cat = df_cat.fillna('missing')
df_encoded = pd.get_dummies(df_cat, columns=cat_cols, prefix=cat_cols)



df_pred = pd.concat([df_encoded,genres_split], axis=1)
df_pred = pd.concat([df_pred,df], axis=1)

df_pred = df_pred.drop(df_cat.columns.to_list(), axis=1)

In [None]:
# cuantil de mejores escritores

directors_df = df['directors'].str.split(', ', expand=True).stack().reset_index(level=1, drop=True).rename('director')
df_split = df.drop('directors', axis=1).join(directors_df)

director_appearances = df_split['director'].value_counts().reset_index().rename(columns={'index': 'director', 'director': 'appearances'})
popular_directors = director_appearances[director_appearances['appearances'] > 10] ################### se puede modificar las appearances
average_rating_per_director = df_split.groupby('director')['averageRating'].mean().reset_index()
popular_directors = pd.merge(average_rating_per_director, popular_directors, on='director')
popular_directors = popular_directors.sort_values(by='averageRating', ascending=False)

threshold_rating = popular_directors['averageRating'].quantile(0.9)################### se puede modificar el cuantil
top_directors = popular_directors[popular_directors['averageRating'] >= threshold_rating]

In [None]:
# cuantil de mejores escritores

writers_df = df['writers'].str.split(', ', expand=True).stack().reset_index(level=1, drop=True).rename('writer')
df_split = df.drop('writers', axis=1).join(writers_df)

writer_appearances = df_split['writer'].value_counts().reset_index().rename(columns={'index': 'writer', 'writer': 'appearances'})
popular_writers = writer_appearances[writer_appearances['appearances'] > 10] ################### se puede modificar las appearances
average_rating_per_writer = df_split.groupby('writer')['averageRating'].mean().reset_index()
popular_writers = pd.merge(average_rating_per_writer, popular_writers, on='writer')
popular_writers = popular_writers.sort_values(by='averageRating', ascending=False)

threshold_rating = popular_writers['averageRating'].quantile(0.9) ################### se puede modificar el cuantil
top_writers = popular_writers[popular_writers['averageRating'] >= threshold_rating]

In [None]:
top_directors.to_csv('top_directors.csv')
top_writers.to_csv('top_writers.csv')

In [None]:
df = df.drop('averageRating', axis = 1)

In [None]:
def isTop(people, top_people):
    people_list = [person.strip() for person in people.split(',')]
    return int(any(person in top_people for person in people_list))

In [None]:
df_pred['containsTopDirector'] = df['directors'].apply(lambda x: isTop(x, top_directors))
df_pred['containsTopWriter'] = df['writers'].apply(lambda x: isTop(x, top_writers))

In [None]:
df_pred = df_pred.drop('directors', axis = 1)
df_pred = df_pred.drop('writers', axis = 1)
df_pred = df_pred.drop('Unnamed: 0', axis=1)
df_pred = df_pred.drop('attributes', axis=1)

In [None]:
train_target = df_pred['averageRating']
train_target = pd.DataFrame(train_target)
df_pred = df_pred.drop('averageRating', axis = 1)

#### Split

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df_pred, train_target, test_size=0.2, random_state=42)

In [None]:
X_train['seasonNumber'] = X_train['seasonNumber'].apply(lambda x: df['seasonNumber'].median() if x > 250 else x)


X_train['popularity'] = X_train['popularity'].apply(lambda x: df['popularity'].median() if x > 50 else x)
X_train['ordering'] = X_train['ordering'].apply(lambda x: df['ordering'].median() if x > 70 else x)


#### Train

In [None]:
xgb_model = xgb.XGBRegressor(objective = "reg:squarederror", random_state = 42, max_depth = 9, n_estimators = 700, learning_rate = 0.25)
xgb_model.fit(X_train, y_train)
xgb_model.score(X_test, y_test)

0.4211115231702556

#### Re-training

In [None]:
df_pred['seasonNumber'] = df_pred['seasonNumber'].apply(lambda x: df['seasonNumber'].median() if x > 250 else x)


df_pred['popularity'] = df_pred['popularity'].apply(lambda x: df['popularity'].median() if x > 50 else x)
df_pred['ordering'] = df_pred['ordering'].apply(lambda x: df['ordering'].median() if x > 70 else x)


In [None]:
xgb_model = xgb.XGBRegressor(objective="reg:squarederror", learning_rate = 0.25, n_estimators = 700, max_depth = 9, random_state=42)
xgb_model.fit(df_pred, train_target)

#### Data processing

In [None]:
res = pd.read_csv("C:/Here/Facultad/Análisis Predictivo/TP2/data/testear.csv")

In [None]:
res_year = res['startYear'].replace(0,res['startYear'].median())
res['age'] = datetime.now().year - res_year
res['age'] = res['age'].astype(float)
res = res.drop('startYear', axis=1)

In [None]:
res = res.drop(excluded_cols, axis=1)

In [None]:
# removing errors in data
res['runtimeMinutes'] = res['runtimeMinutes'].apply(lambda x: 1 if x < 1 else x)
res['isAdult'] = res['isAdult'].apply(lambda x: 0 if x > 1.1 else x) # el .1 es por si llega a haber un floating point error


res['episodeNumber'] = res['episodeNumber'].fillna(res['episodeNumber'].median())
res['episodeNumber'] = res['episodeNumber'].astype(int)
res['episodeNumber'] = res['episodeNumber'].apply(lambda x: 1 if x < 1 else x)
res['episodeNumber'] = np.log10(res['episodeNumber'])


res['seasonNumber'] = res['seasonNumber'].fillna(res['seasonNumber'].median())




res['hasOrdering'] = res['ordering'].notna().astype(int)
res['hasPopularity'] = res['popularity'].notna().astype(int)


res['popularity'] = res['popularity'].fillna(res['popularity'].median())
res['ordering'] = res['ordering'].fillna(res['ordering'].median())

In [None]:
res['hasAttributes'] = res.apply(has_attributes, axis=1)

In [None]:
# reduce categorias, y por ende features

res['titleType'] = res['titleType'].replace('tvEpisode', 'tv')
res['titleType'] = res['titleType'].replace('tvMiniSeries', 'tv')
res['titleType'] = res['titleType'].replace('tvSeries', 'tv')
res['titleType'] = res['titleType'].replace('tvSpecial', 'tv')


res['titleType'] = res['titleType'].replace('tvShort', 'short')
res['titleType'] = res['titleType'].replace('tvMovie', 'movie')

In [None]:
genres_split = res['genres_x'].str.get_dummies(',')
res = res.drop('genres_x', axis=1)

res_cat = res[cat_cols]
res_cat = res_cat.fillna('missing')
res_encoded = pd.get_dummies(res_cat, columns=cat_cols, prefix=cat_cols)



res_pred = pd.concat([res_encoded,genres_split], axis=1)
res_pred = pd.concat([res_pred,res], axis=1)

res_pred = res_pred.drop(res_cat.columns.to_list(), axis=1)

In [None]:
def isTop(people, top_people):
    people_list = [person.strip() for person in people.split(',')]
    return int(any(person in top_people for person in people_list))

In [None]:
res_pred['containsTopDirector'] = res['directors'].apply(lambda x: isTop(x, top_directors))
res_pred['containsTopWriter'] = res['writers'].apply(lambda x: isTop(x, top_writers))

In [None]:
res_pred = res_pred.drop('directors', axis = 1)
res_pred = res_pred.drop('writers', axis = 1)


Id = res_pred['Unnamed: 0']
res_pred = res_pred.drop('Unnamed: 0', axis=1)
res_pred = res_pred.drop('attributes', axis=1)

In [None]:
def preprocess_test_data(test_data, training_data):
    # Drop columns in test_data that are not in training_data
    test_data = test_data.drop(columns=set(test_data.columns) - set(training_data.columns), errors='ignore')

    # Add missing columns in test_data with median values from training_data
    missing_columns = set(training_data.columns) - set(test_data.columns)
    
    for column in missing_columns:
        if column in training_data.columns:
            median_value = training_data[column].median()
            test_data[column] = median_value

    # Reorder columns to match the order in training_data
    test_data = test_data[training_data.columns]

    return test_data

preprocessed_test_data = preprocess_test_data(res_pred, df_pred)

In [None]:
predictions = xgb_model.predict(res_pred)

In [None]:
submission = pd.DataFrame({
    'Id': res['Unnamed: 0'],
    'averageRating': predictions
})

In [None]:
#  submission.to_csv('predictions_xgboost.csv', index=False)