In [1]:
# Standard imports
import numpy as np
import pandas as pd
pd.set_option('display.max_columns', 500)
import matplotlib.pyplot as plt
import joblib
import re

In [2]:
train = pd.read_pickle('data/train_df.pkl')
test = pd.read_pickle('data/test_df.pkl')

In [3]:
cols = list(train.columns.values)
genre_cols = cols[-6:]
print(len(genre_cols))
print(genre_cols)

6
['Abstract', "Children's", 'Family', 'Strategy', 'Thematic', 'Wargames']


In [4]:
X_train = train[train.columns[~train.columns.isin(genre_cols)]]
y_train = train[train.columns[ train.columns.isin(genre_cols)]]

X_test = test[test.columns[~test.columns.isin(genre_cols)]]
y_test = test[test.columns[ test.columns.isin(genre_cols)]]

In [5]:
from sklearn.preprocessing import StandardScaler
my_standard_scaler = StandardScaler().fit(X_train)
X_train_s = my_standard_scaler.transform(X_train)
X_test_s = my_standard_scaler.transform(X_test)

joblib.dump(my_standard_scaler, 'data/scaler.pkl')

['data/scaler.pkl']

In [6]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression

In [7]:
from sklearn.model_selection import cross_val_score
my_log_model = OneVsRestClassifier(LogisticRegression(random_state=123, solver='lbfgs', max_iter=3000, C=0.01, n_jobs=-1), n_jobs=-1)

scores = cross_val_score(my_log_model, X_train_s, y_train, cv = 5)
print(scores)

for i in range(len(scores)) :
    print(f"Fold {i+1}: {scores[i]}")
print(f"Average Score:{np.mean(scores)}")

[0.65335753 0.65263158 0.63798112 0.65395788 0.65686275]
Fold 1: 0.6533575317604355
Fold 2: 0.6526315789473685
Fold 3: 0.6379811183732752
Fold 4: 0.6539578794480755
Fold 5: 0.6568627450980392
Average Score:0.6509581707254387


In [8]:
my_log_model = OneVsRestClassifier(LogisticRegression(random_state=42, solver='lbfgs', max_iter=3000, C=0.01, n_jobs=-1), n_jobs=-1).fit(X_train_s, y_train)

In [9]:
y_train_pred = my_log_model.predict(X_train_s)
y_train_proba = my_log_model.predict_proba(X_train_s)
y_test_pred = my_log_model.predict(X_test_s)
y_test_proba = my_log_model.predict_proba(X_test_s)

In [10]:
from sklearn.metrics import accuracy_score
print(f'Training score: {accuracy_score(y_train, y_train_pred):0.5f}')
print(f'    Test score: {accuracy_score(y_test, y_test_pred):0.5f}')

Training score: 0.99332
    Test score: 0.64496


In [11]:
y_pred_df = pd.DataFrame(y_test_pred, columns=genre_cols)

# Test set predictions
for g in genre_cols:
    score = accuracy_score(y_test[g], y_pred_df[g])
    print(f'{score:0.4f}  {g}')

0.9072  Abstract
0.9203  Children's
0.8647  Family
0.9090  Strategy
0.9386  Thematic
0.9591  Wargames


In [12]:
from sklearn.metrics import classification_report

print(classification_report(y_test,y_pred_df))

              precision    recall  f1-score   support

           0       0.83      0.77      0.80      1096
           1       0.84      0.78      0.81       990
           2       0.48      0.43      0.45       596
           3       0.75      0.61      0.67       699
           4       0.64      0.49      0.56       359
           5       0.94      0.90      0.92      1231

   micro avg       0.80      0.72      0.76      4971
   macro avg       0.74      0.66      0.70      4971
weighted avg       0.79      0.72      0.75      4971
 samples avg       0.73      0.75      0.73      4971



  _warn_prf(average, modifier, msg_start, len(result))


In [13]:
y_test_proba

array([[4.87273445e-04, 4.69049621e-04, 4.17110538e-01, 8.15159215e-05,
        3.26383364e-04, 9.70374924e-01],
       [4.68053096e-01, 4.57538935e-01, 2.96710033e-02, 8.12611170e-05,
        2.65019157e-04, 3.81127815e-01],
       [3.20335912e-05, 8.55501500e-05, 3.18898002e-01, 9.99993935e-01,
        1.89086289e-03, 4.44569180e-04],
       ...,
       [9.94382525e-01, 4.58926277e-04, 8.09390619e-02, 1.12788407e-03,
        1.06607169e-03, 1.55441895e-03],
       [9.68680690e-01, 9.96152012e-02, 6.00175457e-05, 6.90522019e-04,
        6.36489901e-07, 5.51254774e-03],
       [2.17251411e-05, 6.95235108e-02, 9.30828934e-01, 1.54215375e-02,
        9.81727958e-01, 8.19274429e-05]])

In [14]:
y_test

Unnamed: 0,Abstract,Children's,Family,Strategy,Thematic,Wargames
11809,0,0,0,0,0,1
3993,0,1,0,0,0,0
7834,0,0,0,1,0,0
4708,0,1,0,0,0,0
2993,0,1,0,0,0,0
...,...,...,...,...,...,...
13266,0,0,0,0,0,1
11859,0,0,0,0,0,1
16534,1,0,0,0,0,0
15123,1,0,0,0,0,0


In [15]:
my_tfidf = joblib.load('data/my_tfidf_min10.pkl')
my_scaler = joblib.load('data/scaler.pkl')

In [16]:
my_string = "Theme Players take the part of land owners, attempting to buy and then develop their land. Income is gained by other players visiting their properties and money is spent when they visit properties belonging to other players. When times get tough, players may have to mortgage their properties to raise cash for fines, taxes and other misfortunes. Gameplay On his turn, a player rolls two dice and moves that number of spaces around the board. If the player lands on an as-yet-unowned property, he has the opportunity to buy it and add it to his portfolio or allow the bank to auction it to the highest bidder. If a player owns all the spaces within a color group, he may then build houses and hotels on these spaces, generating even more income from opponents who land there. If he lands on a property owned by another player, he must pay that player rent according to the value of the land and any buildings on it. There are other places on the board which can not be bought, but instead require the player to draw a card and perform the action on the card, pay taxes, collect income, or even go to jail. Goal The goal of the game is to be the last player remaining with any money. Cultural impact on rules Monopoly is unusual in that the game has official, printed rules, but most players learn how to play from others, never actually learning the correct way to play. This has led to the canonization of a number of house rules that make the game more palatable to children (and sore losers) but harm the gameplay by preventing players from going bankrupt or slowing down the rate of property acquisition. One common house rule has players put any money paid to the bank in the center of the board, which jackpot a player may earn by landing on Free Parking. This prevents the game from removing money from play, and since players collect $200 each time they pass Go, this results in ever-increasing bankrolls and players surviving rents that should have bankrupted them. Another house rule allows players to take loans from the bank instead of going bankrupt, which means the game will never end. Some house rules arise out of ignorance rather than attempts to improve the game. For instance, many players don't know that properties landed on but left unbought go up for auction, and even some that know to auction don't know that the bidding starts at $1, meaning a player may pay well below the listed price for an auctioned property. Background In the USA in 1933, Charles Darrow devised Monopoly based on an earlier game by Elizabeth J. Magie. The patent was filed 31st August 1935 while the game was on sale in America. Based on an earlier game, The Landlord's Game, it was at first rejected by Parker Bros., as being too complicated to be a success. How wrong could they be! It came to the UK in 1936, made under licence by Waddingtons. Darrow died in 1967 having realised he had developed one of the most successful board games of all times. It was awarded as Game of the Century by the TRA (Toy Retailers Association). Monopoly was patented in 1935 by Charles Darrow and released by Parker Brothers. The game was actually one of a number of variants in existence at the time, all of which date back to an earlier, 1904 game by Elizabeth J. Magie called The Landlord's Game. Magie was a proponent of the Single Tax put forth by famous author Henry George. The game was designed to show the evils of earning money from renting land (as it leads to the destitution of all but one player) and the virtues of the proposed Single Tax - players could choose to play under regular rules or alternate Single Tax rules. The game didn't really go anywhere and Magie lost interest in it. Variations of the game evolved, however, and homemade versions traveled up and down the Atlantic coast and even as far west as Michigan and Texas, being developed all along the way. Eventually the game was noticed by Charles Darrow, who introduced it to the world in its current form."

In [17]:
def clean_desc(raw_html):
    clean = re.sub("[^a-zA-Z]"," ",raw_html)
    clean = ' '.join(clean.split())
    clean = clean.lower()
    return clean

In [18]:
my_string = clean_desc(my_string)

In [19]:
from nltk.corpus import stopwords
from nltk import word_tokenize, PorterStemmer, WordNetLemmatizer


def tokenizer(text):
    """
    Tokenizes the document
    """
    return word_tokenize(text)

#Load up our stop words
stop_words = stopwords.words('english')
#Adds stuff to our stop words list
stop_words.extend(['.',','])

## This function can improve, simplify. Look into Text Data Lecture
def remove_stopwords(list_of_tokens):
    """
    Removes stopwords
    """
    cleaned_tokens = []

    for token in list_of_tokens:
        if token in stop_words: continue
        cleaned_tokens.append(token)

    return cleaned_tokens

def stemmer(list_of_tokens):
    '''
    Takes in an input which is a list of tokens, and spits out a list of stemmed tokens.
    '''

    stemmed_tokens_list = []

    for i in list_of_tokens:

        token = PorterStemmer().stem(i)
        stemmed_tokens_list.append(token)

    return stemmed_tokens_list

def lemmatizer(list_of_tokens):

    lemmatized_tokens_list = []

    for i in list_of_tokens:
        token = WordNetLemmatizer().lemmatize(i)
        lemmatized_tokens_list.append(token)

    return lemmatized_tokens_list


def the_untokenizer(token_list):
    '''
    Returns all the tokenized words in the list to one string.
    Used after the pre processing, such as removing stopwords, and lemmatizing.
    '''
    return " ".join(token_list)

def clean_string(my_string):
    tokenized_list = word_tokenize(my_string)
    removed_stopwords = remove_stopwords(tokenized_list)
    stemmed_words = stemmer(removed_stopwords)
    lemmatized_words = lemmatizer(stemmed_words)
    back_to_string = the_untokenizer(lemmatized_words)
    return back_to_string

In [20]:
input_tfidf = my_tfidf.transform([clean_string(my_string)])

In [21]:
input_transformed_df = pd.DataFrame(input_tfidf.toarray(), columns=my_tfidf.get_feature_names())

In [22]:
input_final_df = my_scaler.transform(input_transformed_df)

In [23]:
input_pred = my_log_model.predict_proba(input_final_df)

In [24]:
input_pred

array([[1.95593726e-02, 1.55674133e-04, 4.85605699e-01, 3.28340610e-01,
        1.20203289e-03, 2.08759836e-04]])

In [26]:
joblib.dump(my_log_model, 'data/my_best_model.pkl')

['data/my_best_model.pkl']

In [27]:
stopwords

<WordListCorpusReader in 'C:\\Users\\Noah\\AppData\\Roaming\\nltk_data\\corpora\\stopwords'>