In [1]:
import networkx as nx
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import string
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer
%matplotlib inline

lancaster=LancasterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\olive\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
sentences = []
with open(r"./AllsWellThatEndsWell.txt", 'r') as f:
    for line in f.read().split('\n'):
        if line.upper() != line:
            sentences.append(line)
shks_index = len(sentences)
print("Shakespear Index: {} to {}".format(0, shks_index))
with open(r"./lyric_data.txt", 'r', encoding='utf-8') as f:
    for line in f.read().split('\n'):
        if len(line)==0:
            continue
        if line[0] != '[':
            sentences.append(line)
print("Migos Index: {} to {}".format(shks_index, len(sentences)))

Shakespear Index: 0 to 3329
Migos Index: 3329 to 24993


In [3]:
def diedge_exists(a, b):
    # direction matters!
    try:
        return b in SEN.neighbors(a)
    except nx.NetworkXError:
        return False

In [4]:
data = []
for sentence in sentences:
    sentence = sentence.translate(str.maketrans('', '', string.punctuation))
    words = sentence.lower().split()
    words = list(filter(lambda word: word not in stopwords.words('english'), words))
    words = [lancaster.stem(word) for word in words]
    SEN = nx.DiGraph()
    for i in range(len(words)-1):
        for j in range(i+1, len(words)):
            # Only track 1 basewords ahead
            if j > i + 2:
                break
            if diedge_exists(words[i], words[j]):
                SEN[words[i]][words[j]]['weight'] = SEN[words[i]][words[j]]['weight'] + 1/(j-i)
            else:
                SEN.add_edge(words[i], words[j], weight=1/(j-i))
    sen_dic = {}
    for edge in SEN.edges.data('weight', default=1):
        sen_dic['_'.join([edge[0], edge[1]])] = edge[2]
    data.append(sen_dic)

In [5]:
df = pd.DataFrame(data).fillna(0)
del data

In [6]:
df.head()

Unnamed: 0,000_aint,000_brain,000_chain,000_chop,000_crap,000_doll,000_fac,000_nothin,000_plain,000_rol,...,“damn_boy”,“damn_think,“fuck_cup,“fuck_it”,“plain_jane”,“put_layaway”,“put_shit,“yadda_mean”,⟔so_damn,⟔so_many
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
df['Corpus'] = 1
df.loc[:shks_index, 'Corpus'] = 0
df.head()

Unnamed: 0,000_aint,000_brain,000_chain,000_chop,000_crap,000_doll,000_fac,000_nothin,000_plain,000_rol,...,“damn_think,“fuck_cup,“fuck_it”,“plain_jane”,“put_layaway”,“put_shit,“yadda_mean”,⟔so_damn,⟔so_many,Corpus
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [10]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

In [11]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Corpus'], axis=1), df['Corpus'], test_size=0.33)

In [12]:
clf = RandomForestClassifier(n_estimators=200, max_depth=3)
clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=200, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [13]:
from sklearn.metrics import accuracy_score
y_pred = clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8668768186226964

In [14]:
from sklearn.metrics import f1_score
f1_score(y_test, y_pred)

0.9286920379270035