In [1]:
# Import all needed dependencies

import numpy as np
import pandas as pd
import math
from xgboost import XGBRegressor

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, average_precision_score
from sklearn.model_selection import train_test_split
from category_encoders import TargetEncoder

In [2]:
# Create project variables for training and test

df_dataset = pd.read_csv("./netflix_titles_2.csv")
df_to_predict_and_submit = pd.read_csv("./netflix_titles_2.csv")

In [3]:
# Try to display data

df_dataset.head()

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description,UserScore
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm...",0.00076
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t...",0.05743
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...,0.000747
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo...",0.016059
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...,0.005066


In [4]:
# See how many lines do not have a cast

df_dataset['cast'].isna().sum()

825

In [5]:
# Set needed and unwanted columns

selected_features = ['type', 'country', 'release_year', 'duration', 'listed_in', 'UserScore']
removed_features = ['show_id', 'director', 'title', 'cast', 'date_added', 'rating', 'description']

In [6]:
# Trim columns

df_trimed = df_dataset.drop(columns=removed_features)
df_trimed.head()

Unnamed: 0,type,country,release_year,duration,listed_in,UserScore
0,Movie,United States,2020,90 min,Documentaries,0.00076
1,TV Show,South Africa,2021,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries",0.05743
2,TV Show,,2021,1 Season,"Crime TV Shows, International TV Shows, TV Act...",0.000747
3,TV Show,,2021,1 Season,"Docuseries, Reality TV",0.016059
4,TV Show,India,2021,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",0.005066


In [7]:
# Trim null values

#for feature in selected_features :
#    df_dataset[feature] = df_dataset[feature].fillna('')

In [8]:
# Combine selected features

#combined_features = ''
#for feature in selected_features :
#    combined_features += ' '+df_dataset[feature]
#print(combined_features)

# Transformer données :

**OHE =** OneHotEncoding   ---------- **TE =** TargetEncoding

- **type :** passer en 0 (movie) et 1 (tv show)
- **director :** laisser tel quel
- **cast :** limiter aux deux premiers noms PUIS transformer en vecteur de plusieurs colonnes (TE)
- **country :** transformer en vecteur de plusieurs colonnes (OHE)
- **release year :** laisser tel quel
- **duration :** transformer "X season(s)" en "XX min" puis tous les "XX min" en "XX"
- **listed_in :** limiter aux trois premières PUIS transformer en vecteur de plusieurs colonnes (TE)

In [9]:
#te = TargetEncoder()
#encoded_listed_in = te.fit_transform(df_trimed['listed_in'], df_trimed['Target'])
#encoded_listed_in

#tags = df_trimed['listed_in'].str.split(',', expand = True)
#tags = tags.add_prefix('tag_')
#df_tried = pd.concat([df_trimed, tags], axis=1)
#tags_encoder = TargetEncoder(cols=tags.columns)
#encoded_tags = pd.Series(dtype=float)
#df_encoded = tags_encoder.fit_transform(df_trimed, encoded_tags)
#df_encoded

In [10]:
# Movie -> 0 ; TV Show -> 1

#mask = df_trimed["type"] == "Movie"
#df_trimed[mask]["type"] = 0
#df_trimed[~mask]['type'] = 1

In [11]:
# Comme le masque mais en plus poussé

def movie_tvShow_filter(x):
    if x["type"] == "Movie" :
        return 0
    else :
        return 1

df_trimed["type_float"] = df_trimed.apply(movie_tvShow_filter, axis = 1)
df_trimed = df_trimed.drop("type", axis = 1)

In [12]:
# Duration edit (1 season = 500min)

def formatDuration(x):
    res_str = str(x["duration"])
    
    if(res_str == "nan") :
        return 326
    else :
        if x["type_float"] == 0 :
            return int(res_str[:-4])
        else :
            return int(res_str[0]) * 500

df_trimed["duration_float"] = df_trimed.apply(formatDuration, axis = 1)
df_trimed = df_trimed.drop("duration", axis = 1)

In [13]:
# Countries edit

#def selectFirstCountry(x):
    
#df_trimed["countries_float"] = df_trimed.apply(selectFirstCountry)

#one_hot = pd.get_dummies(df_trimed['country'])
#df_trimed = pd.concat([df_trimed, one_hot], axis=1)
#df_trimed = df_trimed.drop("country", axis = 1)

In [14]:
res = []
for cell in df_dataset["country"]:
    if str(cell) != 'nan' :
        for country in str(cell).split(", "):
            res.append(country)
set(res)

countries = list(set(res))
countries_columns = pd.DataFrame(columns=countries, index = df_trimed.index)
countries_columns = countries_columns.fillna(0)
df_trimed = pd.concat([df_trimed, countries_columns], axis = 1)

In [15]:
for index, row in df_trimed.iterrows():
    if str(row["country"]) != 'nan' :
        for country in str(row["country"]).split(", "):
            df_trimed.at[index, country] = 1

In [16]:
res = []
for cell in df_dataset["listed_in"]:
    if str(cell) != 'nan' :
        for genre in str(cell).split(", "):
            res.append(genre)
set(res)

genres = list(set(res))
genres_columns = pd.DataFrame(columns=genres, index = df_trimed.index)
genres_columns = genres_columns.fillna(0)
df_trimed = pd.concat([df_trimed, genres_columns], axis = 1)

In [17]:
for index, row in df_trimed.iterrows():
    if str(row["listed_in"]) != 'nan':
        for genre in row["listed_in"].split(", "):
            df_trimed.at[index, genre] = 1

In [18]:
df_trimed = df_trimed.drop(columns=["listed_in", "country"])
df_trimed

Unnamed: 0,release_year,UserScore,type_float,duration_float,Unnamed: 5,Denmark,Nepal,Morocco,Sweden,Cuba,...,Crime TV Shows,Action & Adventure,Science & Nature TV,Classic & Cult TV,Dramas,Faith & Spirituality,TV Sci-Fi & Fantasy,Stand-Up Comedy,TV Action & Adventure,Romantic TV Shows
0,2020,0.000760,0,90,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2021,0.057430,1,1000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2021,0.000747,1,500,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,1,0
3,2021,0.016059,1,500,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2021,0.005066,1,1000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8802,2007,0.011004,0,158,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
8803,2018,0.002564,1,1000,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8804,2009,0.150050,0,88,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8805,2006,0.760169,0,88,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Train test data

In [19]:
#scikit-learn
#X, y = np.arange(10).reshape((5, 2)), range(5)

X = df_trimed.drop("UserScore", axis=1)
y = df_trimed["UserScore"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

## IA

In [21]:
#xgboost
model = XGBRegressor()
model.fit(X_train, y_train)

In [22]:
predict = model.predict(X_test)
print(predict)

[ 0.49869156  0.09436858  0.1106285  ... -0.00181017  0.00265003
  1.628878  ]


In [24]:
model.save_model("model.json")