In [91]:
import pandas as pd

In [92]:
df = pd.read_csv("genre_titles.csv")

In [93]:
df.columns = ["title", "genre", "year", "length", "pulp"]
df

Unnamed: 0,title,genre,year,length,pulp
0,Beyond the Black River,Weird,1935.0,SS,yes
1,The Hour of the Dragon,Weird,1935.0,novel,yes
2,The People of the Black Circle,Weird,1934.0,novel,yes
3,Red Nails,Weird,1936.0,novel,yes
4,Queen of the Black Coast,Weird,1934.0,SS,yes
5,A Witch Shall Be Born,Weird,1934.0,novel,yes
6,The Devil in Iron,Weird,1934.0,SS,yes
7,Shadows in the Moonlight,Weird,1934.0,SS,yes
8,Shadows in Zamboula,Weird,1935.0,SS,yes
9,Jewels of Gwahlur,Weird,1935.0,SS,yes


In [13]:
import requests 
import enchant

words = requests.get('http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words')

#DictVectorizer
#{word:0, word2: 1}
titles = list(df['title'])

#tokenize, lowercase, count, remove punctuation

stoplist1 = words.text.split("\r\n")

from nltk.corpus import stopwords
stoplist2 = set(stopwords.words('english'))

stoplist1.extend(stoplist2)

fullstops = list(set(stoplist1))
def remove_stops(stoplist, wordlist):
    result = []
    for i in wordlist:
        if i not in stoplist:
                result.append(i)
    return result

def spellcheck(wordlist):
    result = []
    d = enchant.Dict("en_US")
    for i in wordlist:
        if d.check(i) or d.check(i.capitalize()):
            result.append(i)
         
    return result

def clean_text(list_of_texts):
    fully_cleaned =[]
    #normalize ocr errors
    for i in list_of_texts:
        #lowercase all
        ocr_lower = i.lower()
        #tokenize, remove punctuation and numbers, remove tabs, newlines, etc.
        ocr_cleaner = ocr_lower.replace("\n", " ").replace("\t", " ")
        ocr_tokens = ocr_cleaner.split(" ")
        
        no_numbers_or_punct = []
        for token in ocr_tokens:
            if token.isalpha():
                no_numbers_or_punct.append(token)
            else:
                
                new_token = ""
                for letter in token:
                    if letter.isalpha():
                        new_token += letter
                if new_token != "":
                    no_numbers_or_punct.append(new_token)  
        #almost_ready_before_spellcheck = remove_stops(fullstops, no_numbers_or_punct)
        #almost_ready = spellcheck(almost_ready_before_spellcheck)
        almost_ready = spellcheck(no_numbers_or_punct)
        from nltk.stem import WordNetLemmatizer
        #lemmatization
        wordnet_lemmatizer = WordNetLemmatizer()
        lemmas = []
        for token in almost_ready:
            lemma = wordnet_lemmatizer.lemmatize(token)
            lemmas.append(lemma)
        
        ready = [i for i in lemmas if len(i) > 2]
        fully_cleaned.append(ready)
    return fully_cleaned

clean_titles = clean_text(titles)
clean_titles[0]

['beyond', 'the', 'black', 'river']

In [15]:
from collections import Counter
title_counts = [dict(Counter(i)) for i in clean_titles]
title_counts[0]

{'beyond': 1, 'black': 1, 'river': 1, 'the': 1}

In [18]:
def dict_to_binary(dictionary):
    p = dict(dictionary)
    for i in dictionary.keys():
        p[i] = 1
    return p
title_binary = [dict_to_binary(i) for i in title_counts]
title_binary[0]

{'beyond': 1, 'black': 1, 'river': 1, 'the': 1}

In [44]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
X = v.fit_transform(title_binary)

In [45]:
v.vocabulary_
#black
X[0][36]

1.0

In [46]:
is_crime = [1 if i.lower().strip() == "crime" else 0 for i in list(df['genre'])]
is_horror = [1 if i.lower().strip() == "horror" else 0 for i in list(df['genre'])]
is_scifi = [1 if i.lower().strip() == "science fiction" else 0 for i in list(df['genre'])]

In [47]:
import random as r
positions_train = []
for i in range(50):
    pos = r.randint(0,len(is_crime))
    positions_train.append(pos)

In [48]:
is_crime_train = [is_crime[i] for i in positions_train]
vects_train = [X[i] for i in positions_train]
positions_test = []
for i,j in enumerate(is_crime):
    if i not in positions_train:
        positions_test.append(i)

In [74]:
is_crime_test = [is_crime[i] for i in positions_test]
vects_test = [X[i] for i in positions_test]
#X_train = vects_train
#y_train = is_crime_train
#X_test = vects_test
#y_test = is_crime_test

In [75]:
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.calibration import calibration_curve

# Create classifiers
lr = LogisticRegression()
gnb = GaussianNB()
svc = LinearSVC(C=1.0)
rfc = RandomForestClassifier(n_estimators=100)


In [100]:
mytuples = list(zip(is_crime, is_horror, is_scifi, list(df['title'])))
mytuples

[(0, 0, 0, 'Beyond the Black River'),
 (0, 0, 0, 'The Hour of the Dragon'),
 (0, 0, 0, 'The People of the Black Circle'),
 (0, 0, 0, 'Red Nails'),
 (0, 0, 0, 'Queen of the Black Coast'),
 (0, 0, 0, 'A Witch Shall Be Born'),
 (0, 0, 0, 'The Devil in Iron'),
 (0, 0, 0, 'Shadows in the Moonlight'),
 (0, 0, 0, 'Shadows in Zamboula'),
 (0, 0, 0, 'Jewels of Gwahlur'),
 (0, 1, 0, 'Tiger Cat '),
 (0, 1, 0, 'The Secret of Kralitz'),
 (0, 0, 1, 'Old Mr. Wiley'),
 (0, 0, 1, 'The Mississippi Saucer'),
 (0, 1, 0, 'The Shunned House'),
 (0, 1, 0, 'The Dunwich Horror'),
 (0, 0, 1, 'The Tree of Life'),
 (0, 1, 0, 'The Masque of the Red Death'),
 (0, 1, 0, 'The Fall of the House of Usher'),
 (0, 1, 0, 'The Lost Door'),
 (0, 1, 0, 'Pledged to the Dead'),
 (0, 1, 0, "Dracula's Guest"),
 (0, 1, 0, 'The Golgotha Dancers'),
 (0, 0, 1, 'The Purple Cloud '),
 (0, 1, 0, 'The House of Sounds'),
 (0, 1, 0, 'Xélucha'),
 (0, 1, 0, 'Picture of Dorian Gray'),
 (0, 1, 0, 'The Lair of the White Worm'),
 (0, 1, 0, 'The

In [160]:
from sklearn.model_selection import train_test_split

shuffled = list(zip(X, mytuples))
r.shuffle(shuffled)
X = [i[0] for i in shuffled]
y = [i[1] for i in shuffled]

X_train, X_test, y_train_tuples, y_test_tuples = train_test_split(X, y, test_size=0.5, random_state=42)

import itertools

y_pred = lr.fit(X_train, [i[1] for i in y_train_tuples]).predict(X_test)
y_prob = lr.predict_proba(X_test)

df_results = pd.DataFrame([i[3] for i in y_test_tuples])
df_results['really scifi'] = [i[2] for i in y_test_tuples]  
df_results['scifi_probability'] = [i[0] for i in y_prob]
df_results = df_results.sort_values(by='scifi_probability')
probs = list(df_results['scifi_probability'])

not_crime = [0 for i in probs[:int(len(probs)/2)]]
crime = [1 for i in probs[int(len(probs)/2):]]
not_crime.extend(crime)
len(probs), len(not_crime)
df_results['preds'] = not_crime
#df_results.to_csv("scifi_vs_all_lr.csv")
df_results

Unnamed: 0,0,really scifi,scifi_probability,preds
94,The Amphibians,1,0.465449,0
128,The Exile of the Skies,1,0.513466,0
100,The Lipstick Clue,0,0.515030,0
105,The Three Impostors,0,0.516502,0
44,The Man-Wolf,0,0.516502,0
102,Messenger to Infinity,1,0.563148,0
143,The prince of peril,1,0.570524,0
55,The angel of the revolution,1,0.571397,0
173,Horla,0,0.595853,0
0,The House of Sounds,0,0.595853,0


In [161]:
from sklearn.metrics import accuracy_score
#accuracy_score(df_results['really scifi'], df_results['preds'])
accuracy_score(df_results['really scifi'], y_pred)

0.60752688172043012

In [81]:
from sklearn.metrics import confusion_matrix

def plot_confusion_matrix(cm, classes,
                          normalize=False,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cm[i, j],
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

# Compute confusion matrix
cnf_matrix = confusion_matrix(y_test, y_pred)
np.set_printoptions(precision=2)

# Plot non-normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=["crime", "not crime"],
                      title='Confusion matrix, without normalization')

# Plot normalized confusion matrix
plt.figure()
plot_confusion_matrix(cnf_matrix, classes=["crime", "not crime"], normalize=True,
                      title='Normalized confusion matrix')

plt.show()

[0,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 1,
 1,
 1,
 1,
 0,
 1,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 1,
 1,
 1,
 0,
 0,
 0,
 1,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 1,
 0,
 1,
 0,
 0,
 0,
 0,
 0,


In [None]:
% matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
plt.figure(figsize=(10, 10))
ax1 = plt.subplot2grid((3, 1), (0, 0), rowspan=2)
ax2 = plt.subplot2grid((3, 1), (2, 0))

ax1.plot([0, 1], [0, 1], "k:", label="Perfectly calibrated")
for clf, name in [(lr, 'Logistic'),
                  (gnb, 'Naive Bayes'),
                  (svc, 'Support Vector Classification'),
                  (rfc, 'Random Forest')]:
    clf.fit(X_train, y_train)
    if hasattr(clf, "predict_proba"):
        prob_pos = clf.predict_proba(X_test)[:, 1]
    else:  # use decision function
        prob_pos = clf.decision_function(X_test)
        prob_pos = \
            (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
    fraction_of_positives, mean_predicted_value = \
        calibration_curve(y_test, prob_pos, n_bins=10)

    ax1.plot(mean_predicted_value, fraction_of_positives, "s-",
             label="%s" % (name, ))

    ax2.hist(prob_pos, range=(0, 1), bins=10, label=name,
             histtype="step", lw=2)

ax1.set_ylabel("Fraction of positives")
ax1.set_ylim([-0.05, 1.05])
ax1.legend(loc="lower right")
ax1.set_title('Calibration plots  (reliability curve)')

ax2.set_xlabel("Mean predicted value")
ax2.set_ylabel("Count")
ax2.legend(loc="upper center", ncol=2)

plt.tight_layout()
plt.show()