In [8]:
%run data_prep.ipynb

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [9]:
backup_movies_df = movies_df.copy() # Useful when movies_df is accidentally changed below 
movies_df.head()

Unnamed: 0,budget,genres,id,original_title,production_companies,production_countries,release_date,revenue,title,cast,director,keywords
0,30000000,"[Animation, Comedy, Family]",862,Toy Story,[Pixar Animation Studios],[United States of America],1995-10-30,373554033.0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter],"[jealousy, toy, boy, friendship, friends, riva..."
1,65000000,"[Adventure, Fantasy, Family]",8844,Jumanji,"[TriStar Pictures, Teitler Film, Interscope Co...",[United States of America],1995-12-15,262797249.0,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",[Joe Johnston],"[board game, disappearance, based on children'..."
2,0,"[Romance, Comedy]",15602,Grumpier Old Men,"[Warner Bros., Lancaster Gate]",[United States of America],1995-12-22,0.0,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",[Howard Deutch],"[fishing, best friend, duringcreditsstinger, o..."
3,16000000,"[Comedy, Drama, Romance]",31357,Waiting to Exhale,[Twentieth Century Fox Film Corporation],[United States of America],1995-12-22,81452156.0,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devi...",[Forest Whitaker],"[based on novel, interracial relationship, sin..."
4,0,[Comedy],11862,Father of the Bride Part II,"[Sandollar Productions, Touchstone Pictures]",[United States of America],1995-02-10,76578911.0,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short, Kim...",[Charles Shyer],"[baby, midlife crisis, confidence, aging, daug..."


In [20]:
from sklearn.multioutput import MultiOutputClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

In [15]:
# Replaces column col of lists of labels with a binary matrix. 
def binarize_column(df, col):
    mlb = MultiLabelBinarizer()
    return df.join(pd.DataFrame(
        mlb.fit_transform(df[col]), columns=mlb.classes_, index=df.index), rsuffix="_keyword").drop(col, axis=1)

# Takes pandas Series with lists of labels as values.
# Returns list of labels for which number of occurances > limit.
# Number of labels for 'keywords': > 0 : 18186, > 10 : 2193, > 20 : 1158
def popular_labels(series, limit):
    counts = {}
    for l in series:
        for v in l:
            counts[v] = counts.get(v, 0)+1
    return [k for k,v in counts.items() if v > limit]

# Takes pandas Series with lists of labels as values.
# Creates a new series with only labels that occur at least limit times.
def limit_labels(series, limit):
    new_series = series.copy()
    labels = popular_labels(series, limit)
    for i, l in series.iteritems():
        new_l = [val for val in l if val in labels]
        new_series[i] = new_l
    return new_series

In [16]:
# Transform categorical features into binary matrixes

movies_df = backup_movies_df.copy()

movies_df['production_countries'] = limit_labels(movies_df['production_countries'], 20)
movies_df = binarize_column(movies_df, 'production_countries')
movies_df['keywords'] = limit_labels(movies_df['keywords'], 20)
movies_df = binarize_column(movies_df, 'keywords')

movies_df.head()

Unnamed: 0,budget,genres,id,original_title,production_companies,release_date,revenue,title,cast,director,...,yacht,yakuza,young adult,young boy,young love,youth,zatoichi,zombie,zombie apocalypse,zoo
0,30000000,"[Animation, Comedy, Family]",862,Toy Story,[Pixar Animation Studios],1995-10-30,373554033.0,Toy Story,"[Tom Hanks, Tim Allen, Don Rickles, Jim Varney...",[John Lasseter],...,0,0,0,0,0,0,0,0,0,0
1,65000000,"[Adventure, Fantasy, Family]",8844,Jumanji,"[TriStar Pictures, Teitler Film, Interscope Co...",1995-12-15,262797249.0,Jumanji,"[Robin Williams, Jonathan Hyde, Kirsten Dunst,...",[Joe Johnston],...,0,0,0,0,0,0,0,0,0,0
2,0,"[Romance, Comedy]",15602,Grumpier Old Men,"[Warner Bros., Lancaster Gate]",1995-12-22,0.0,Grumpier Old Men,"[Walter Matthau, Jack Lemmon, Ann-Margret, Sop...",[Howard Deutch],...,0,0,0,0,0,0,0,0,0,0
3,16000000,"[Comedy, Drama, Romance]",31357,Waiting to Exhale,[Twentieth Century Fox Film Corporation],1995-12-22,81452156.0,Waiting to Exhale,"[Whitney Houston, Angela Bassett, Loretta Devi...",[Forest Whitaker],...,0,0,0,0,0,0,0,0,0,0
4,0,[Comedy],11862,Father of the Bride Part II,"[Sandollar Productions, Touchstone Pictures]",1995-02-10,76578911.0,Father of the Bride Part II,"[Steve Martin, Diane Keaton, Martin Short, Kim...",[Charles Shyer],...,0,0,0,0,0,0,0,0,0,0


In [23]:
train_df, test_df = train_test_split(movies_df, test_size=0.2)

X_cols = ['budget', 'revenue'] # For now only numeric features
X_train = train_df[X_cols]
X_test = test_df[X_cols]

# We need to transform 'genres' since this is multi-label classification
mlb = MultiLabelBinarizer()
# TODO: Can it might happen that a genre is in the train set, but not the test set and would it break something?
y_train = mlb.fit_transform(train_df['genres'])
y_test = mlb.fit_transform(test_df['genres'])

genres = []
for x in test_df['genres']:
    for genre in x:
        if genre not in genres:
            genres.append(genre)

In [24]:
# Random forest, only numeric columns
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
y_pred, y_test
accuracy_score(y_test, y_pred)
classification_report(y_test ,y_pred,target_names=genres, output_dict=True )


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Mystery': {'precision': 0.3409090909090909,
  'recall': 0.07871064467766117,
  'f1-score': 0.12789281364190014,
  'support': 1334},
 'Thriller': {'precision': 0.36627906976744184,
  'recall': 0.08786610878661087,
  'f1-score': 0.14173228346456693,
  'support': 717},
 'Drama': {'precision': 0.1276595744680851,
  'recall': 0.015113350125944584,
  'f1-score': 0.02702702702702703,
  'support': 397},
 'Horror': {'precision': 0.35785288270377735,
  'recall': 0.06514657980456026,
  'f1-score': 0.11022657685241885,
  'support': 2763},
 'Comedy': {'precision': 0.15606936416184972,
  'recall': 0.03218116805721097,
  'f1-score': 0.05335968379446641,
  'support': 839},
 'Music': {'precision': 0.1702127659574468,
  'recall': 0.01008827238335435,
  'f1-score': 0.019047619047619046,
  'support': 793},
 'Romance': {'precision': 0.5098425196850394,
  'recall': 0.12637228592339594,
  'f1-score': 0.20254154447702832,
  'support': 4099},
 'Animation': {'precision': 0.19230769230769232,
  'recall': 0.035

In [22]:
# Logistic regression, only numeric columns
clf = MultiOutputClassifier(LogisticRegression()).fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)
classification_report(y_test ,y_pred,target_names=genres, output_dict=True )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Drama': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 1345},
 'History': {'precision': 0.10106382978723404,
  'recall': 0.07818930041152264,
  'f1-score': 0.08816705336426915,
  'support': 729},
 'Comedy': {'precision': 0.046511627906976744,
  'recall': 0.05759162303664921,
  'f1-score': 0.05146198830409356,
  'support': 382},
 'Horror': {'precision': 0.38922155688622756,
  'recall': 0.07088331515812432,
  'f1-score': 0.11992619926199262,
  'support': 2751},
 'Mystery': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 881},
 'Thriller': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 774},
 'Family': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 4174},
 'Action': {'precision': 0.07385229540918163,
  'recall': 0.06666666666666667,
  'f1-score': 0.07007575757575758,
  'support': 555},
 'Foreign': {'precision': 0.0673469387755102,
  'recall': 0.06903765690376569,
  'f1-score': 0.06818181818181818,
  'support': 478},
 'TV M

In [25]:
X_cols_to_drop = ['genres', 'id', 'original_title', 'production_companies', 
                  'release_date', 'title', 'cast', 'director']
X_train = train_df.drop(columns=X_cols_to_drop, axis=1)
X_test = test_df.drop(columns=X_cols_to_drop, axis=1)

In [26]:
# Random forest with limited keywords and production_countries
rfc = RandomForestClassifier()
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)
accuracy_score(y_test, y_pred)
classification_report(y_test ,y_pred,target_names=genres, output_dict=True )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Mystery': {'precision': 0.5856643356643356,
  'recall': 0.25112443778110943,
  'f1-score': 0.35152151101783835,
  'support': 1334},
 'Thriller': {'precision': 0.5380434782608695,
  'recall': 0.13807531380753138,
  'f1-score': 0.21975582685904552,
  'support': 717},
 'Drama': {'precision': 0.6641221374045801,
  'recall': 0.21914357682619648,
  'f1-score': 0.3295454545454546,
  'support': 397},
 'Horror': {'precision': 0.5652866242038217,
  'recall': 0.25696706478465436,
  'f1-score': 0.35332172182134863,
  'support': 2763},
 'Comedy': {'precision': 0.5018181818181818,
  'recall': 0.16448152562574495,
  'f1-score': 0.24775583482944347,
  'support': 839},
 'Music': {'precision': 0.5260416666666666,
  'recall': 0.1273644388398487,
  'f1-score': 0.2050761421319797,
  'support': 793},
 'Romance': {'precision': 0.6269430051813472,
  'recall': 0.501829714564528,
  'f1-score': 0.5574525745257453,
  'support': 4099},
 'Animation': {'precision': 0.5338345864661654,
  'recall': 0.124343257443082

In [27]:
# Logistic regression with limited keywords and production_countries
X_train.head()
clf = MultiOutputClassifier(LogisticRegression()).fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)
classification_report(y_test ,y_pred,target_names=genres, output_dict=True )

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'Mystery': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 1334},
 'Thriller': {'precision': 0.09251968503937008,
  'recall': 0.06555090655509066,
  'f1-score': 0.07673469387755102,
  'support': 717},
 'Drama': {'precision': 0.05660377358490566,
  'recall': 0.060453400503778336,
  'f1-score': 0.0584652862362972,
  'support': 397},
 'Horror': {'precision': 0.31026785714285715,
  'recall': 0.05030763662685487,
  'f1-score': 0.08657739022111492,
  'support': 2763},
 'Comedy': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 839},
 'Music': {'precision': 0.0, 'recall': 0.0, 'f1-score': 0.0, 'support': 793},
 'Romance': {'precision': 0.0,
  'recall': 0.0,
  'f1-score': 0.0,
  'support': 4099},
 'Animation': {'precision': 0.07964601769911504,
  'recall': 0.06304728546409807,
  'f1-score': 0.07038123167155426,
  'support': 571},
 'Crime': {'precision': 0.07621247113163972,
  'recall': 0.06457925636007827,
  'f1-score': 0.06991525423728813,
  'support': 511},
