In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

### Import the cleaned-up corpus text

In [9]:
corpus_df = pd.read_csv('Output/corpus.csv', index_col=0)
y = corpus_df['Label']
X = corpus_df['Corpus']

### Conduct train-test split

In [10]:
## Create new train test split for this exercise
X_train, X_test, y_train, y_test = train_test_split(X, y)

### Create base model with LogisticRegression

In [11]:
from sklearn.linear_model import LogisticRegression

In [5]:
m = LogisticRegression(solver='lbfgs') ## Specifying the solver to avoid getting FutureWarnings
m.fit(X_train,y_train)

ValueError: could not convert string to float: 'know feel good smile eye know walk wood guess try year old little girl vibin gong rubbin bowl float love strong place time grace want fall right love avow be fall grace be fall guru muhk be fall grace oh yeah be fall guru muhk move rapture capsule land star meditate morning head dress white beauty bazaar smell purple light comin heart lose wet treat like teacher cause s want pet place time grace want fall right love avow be fall grace be fall guru muhk be fall grace oh yeah be fall guru muhk be fall grace be fall guru muhk be fall grace uhm be fall guru muhk be fall grace be fall guru muhk be fall grace oh be fall guru muhk'

In [None]:
## Cross validation training score
from sklearn.model_selection import cross_val_score

cv_score = cross_val_score(m, X_train,y_train, cv=5)
cv_score

In [None]:
## Test score
m.score(X_test,y_test) 

In [None]:
y_pred = m.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test,y_pred)
cmdf = pd.DataFrame(cm, index=['Act: Queen', 'Act: Rolling Stones'], columns=['Pred: Queen', 'Pred: Rolling Stones'])

In [None]:
cmdf

---

### Create model with Naive Bayes

In [6]:
from sklearn.naive_bayes import MultinomialNB

In [7]:
a_param = 0.1
nb = MultinomialNB(alpha=a_param)

In [8]:
nb.fit(X_train, y_train)

ValueError: could not convert string to float: 'know feel good smile eye know walk wood guess try year old little girl vibin gong rubbin bowl float love strong place time grace want fall right love avow be fall grace be fall guru muhk be fall grace oh yeah be fall guru muhk move rapture capsule land star meditate morning head dress white beauty bazaar smell purple light comin heart lose wet treat like teacher cause s want pet place time grace want fall right love avow be fall grace be fall guru muhk be fall grace oh yeah be fall guru muhk be fall grace be fall guru muhk be fall grace uhm be fall guru muhk be fall grace be fall guru muhk be fall grace oh be fall guru muhk'

In [None]:
## Cross validation training score
from sklearn.model_selection import cross_val_score

cv_score = cross_val_score(nb, X_train,y_train, cv=5)
cv_score

In [None]:
## Test score
nb.score(X_test,y_test) 

In [None]:
y_pred = nb.predict(X_test)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test,y_pred)

In [None]:
from sklearn.metrics import confusion_matrix

cm = confusion_matrix(y_test,y_pred)
cmdf = pd.DataFrame(cm, index=['Act: Queen', 'Act: Rolling Stones'], columns=['Pred: Queen', 'Pred: Rolling Stones'])

In [None]:
cmdf

### Build a scikit learn pipeline 

In [42]:
## Make scikit learn pipeline
p = make_pipeline(CountVectorizer(),TfidfTransformer(),MultinomialNB(alpha=0.01))

In [43]:
## Fit entire pipeline
model = p.fit(X_train, y_train)

In [44]:
## Create prediction from test data
y_pred = model.predict(X_test)

In [45]:
## Measure accuracy on test data
accuracy_score(y_test, y_pred)

0.67

In [46]:
## Confusion matrix
cm = confusion_matrix(y_test,y_pred)
cmdf = pd.DataFrame(cm, index=['Act: Queen', 'Act: Rolling Stones', 'Act: Miley Cyrus', 'Act: RHCP'], columns=['Pred: Queen', 'Pred: Rolling Stones', 'Pred: Miley Cyrus', 'Pred: RHCP'])

In [47]:
cmdf

Unnamed: 0,Pred: Queen,Pred: Rolling Stones,Pred: Miley Cyrus,Pred: RHCP
Act: Queen,22,3,1,0
Act: Rolling Stones,2,16,1,1
Act: Miley Cyrus,4,3,17,4
Act: RHCP,7,6,1,12


---

### Next step is to include this into a .py file to create a program that predicts from any input text whether it is more likely to be from a Queen or a Rolling Stones song (see Queen_vs_RollingStones.py)