In [1]:
%run data_prep.ipynb

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [2]:
backup_movies_df = movies_df.copy() # Useful when movies_df is accidentally changed below 
movies_df.head()

Unnamed: 0,budget,genres,id,original_title,production_companies,production_countries,release_date,revenue,title,cast,director,keywords
0,30000000,"[Animation, Comedy, Family]",862,Toy Story,[Pixar Animation Studios],[United States of America],1995-10-30,373554033.0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter],"[jealousy, toy, boy, friendship, friends, riva..."
1,65000000,"[Adventure, Fantasy, Family]",8844,Jumanji,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995-12-15,262797249.0,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",[Joe Johnston],"[board game, disappearance, based on children'..."
2,0,"[Romance, Comedy]",15602,Grumpier Old Men,"[Warner Bros., Lancaster Gate]",[United States of America],1995-12-22,0.0,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",[Howard Deutch],"[fishing, best friend, duringcreditsstinger, o..."
3,16000000,"[Comedy, Drama, Romance]",31357,Waiting to Exhale,[Twentieth Century Fox Film Corporation],[United States of America],1995-12-22,81452156.0,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devi...",[Forest Whitaker],"[based on novel, interracial relationship, sin..."
4,0,[Comedy],11862,Father of the Bride Part II,"[Sandollar Productions, Touchstone Pictures]",[United States of America],1995-02-10,76578911.0,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short, Kim...",[Charles Shyer],"[baby, midlife crisis, confidence, aging, daug..."


In [3]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [6]:
# Replaces column col of lists of labels with a binary matrix. 
def binarize_column(df, col):
    mlb = MultiLabelBinarizer()
    return df.join(pd.DataFrame(
        mlb.fit_transform(df[col]), columns=mlb.classes_, index=df.index), rsuffix="_keyword").drop(col, axis=1)

# Takes pandas Series with lists of labels as values.
# Returns list of labels for which number of occurances > limit.
# Number of labels for 'keywords': > 0 : 18186, > 10 : 2193, > 20 : 1158
def popular_labels(series, limit):
    counts = {}
    for l in series:
        for v in l:
            counts[v] = counts.get(v, 0)+1
    return [k for k,v in counts.items() if v > limit]

# Takes pandas Series with lists of labels as values.
# Creates a new series with only labels that occur at least limit times.
def limit_labels(series, limit):
    new_series = series.copy()
    labels = popular_labels(series, limit)
    for i, l in series.iteritems():
        new_l = [val for val in l if val in labels]
        new_series[i] = new_l
    return new_series

In [33]:
# Transform categorical features into binary matrixes

movies_df = backup_movies_df.copy()

movies_df['production_countries'] = limit_labels(movies_df['production_countries'], 20)
movies_df = binarize_column(movies_df, 'production_countries')
movies_df['keywords'] = limit_labels(movies_df['keywords'], 20)
movies_df = binarize_column(movies_df, 'keywords')

movies_df.head()

Unnamed: 0,budget,genres,id,original_title,production_companies,release_date,revenue,title,cast,director,...,yacht,yakuza,young adult,young boy,young love,youth,zatoichi,zombie,zombie apocalypse,zoo
0,30000000,"[Animation, Comedy, Family]",862,Toy Story,[Pixar Animation Studios],1995-10-30,373554033.0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter],...,0,0,0,0,0,0,0,0,0,0
1,65000000,"[Adventure, Fantasy, Family]",8844,Jumanji,"[TriStar Pictures, Teitler Film, Interscope Co...",1995-12-15,262797249.0,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",[Joe Johnston],...,0,0,0,0,0,0,0,0,0,0
2,0,"[Romance, Comedy]",15602,Grumpier Old Men,"[Warner Bros., Lancaster Gate]",1995-12-22,0.0,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",[Howard Deutch],...,0,0,0,0,0,0,0,0,0,0
3,16000000,"[Comedy, Drama, Romance]",31357,Waiting to Exhale,[Twentieth Century Fox Film Corporation],1995-12-22,81452156.0,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devi...",[Forest Whitaker],...,0,0,0,0,0,0,0,0,0,0
4,0,[Comedy],11862,Father of the Bride Part II,"[Sandollar Productions, Touchstone Pictures]",1995-02-10,76578911.0,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short, Kim...",[Charles Shyer],...,0,0,0,0,0,0,0,0,0,0


In [38]:
train_df, test_df = train_test_split(movies_df, test_size=0.2)

X_cols = ['budget', 'revenue'] # For now only numeric features
X_train = train_df[X_cols]
X_test = test_df[X_cols]

# We need to transform 'genres' since this is multi-label classification
mlb = MultiLabelBinarizer()
# TODO: Can it might happen that a genre is in the train set, but not the test set and would it break something?
y_train = mlb.fit_transform(train_df['genres'])
y_test = mlb.fit_transform(test_df['genres'])

In [39]:
# Random forest, only numeric columns
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
y_pred, y_test
accuracy_score(y_test, y_pred)

0.0685431886549205

In [40]:
# Logistic regression, only numeric columns
clf = MultiOutputClassifier(LogisticRegression()).fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.05511388053287495

In [49]:
X_cols_to_drop = ['genres', 'id', 'original_title', 'production_companies', 
                  'release_date', 'title', 'cast', 'director']
X_train = train_df.drop(columns=X_cols_to_drop, axis=1)
X_test = test_df.drop(columns=X_cols_to_drop, axis=1)

In [47]:
# Random forest with limited keywords and production_countries
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
accuracy_score(y_test, y_pred)

0.15975504941985388

In [52]:
# Logistic regression with limited keywords and production_countries
X_train.head()
clf = MultiOutputClassifier(LogisticRegression()).fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.05511388053287495