In [63]:
import pandas as pd

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.pipeline import Pipeline

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

import time

import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

In [66]:
import warnings


def warn(*args, **kwargs):
    """No-op stand-in for ``warnings.warn``: accepts any arguments, does nothing."""
    return None


# Install the stub globally so every later warnings.warn(...) call is silenced.
warnings.warn = warn

In [2]:
# Read the dataset from tyler.csv (path is relative to the notebook's working directory).
data = pd.read_csv("tyler.csv")

In [3]:
# Preview the first rows of the freshly loaded frame.
data.head()

Unnamed: 0.1,Unnamed: 0,neg,neu,pos,compound,lyrics,album,year
0,0,0.23,0.643,0.127,-0.9991,This is what the devil plays before he goes to...,Bastard,2009
1,1,0.232,0.675,0.093,-0.9984,I'd tell him to eat a dick quicker than Mexica...,Bastard,2009
2,2,0.158,0.718,0.124,-0.9527,"Hello, the Hype is so Mellow Bitch I'm dope an...",Bastard,2009
3,3,0.196,0.696,0.108,-0.997,Got all the black bitches mad cause my main bi...,Bastard,2009
4,4,0.207,0.658,0.136,-0.9948,If this was a game I already know that I would...,Bastard,2009


In [5]:
# Keep only the lyric text plus the album and year columns; the remaining
# columns of the loaded frame are dropped from `data`.
data = data.loc[:, ['lyrics', 'album', 'year']]

In [7]:
# Preview again to confirm that only the selected columns remain.
data.head()

Unnamed: 0,lyrics,album,year
0,This is what the devil plays before he goes to...,Bastard,2009
1,I'd tell him to eat a dick quicker than Mexica...,Bastard,2009
2,"Hello, the Hype is so Mellow Bitch I'm dope an...",Bastard,2009
3,Got all the black bitches mad cause my main bi...,Bastard,2009
4,If this was a game I already know that I would...,Bastard,2009


In [25]:
# Features are the raw lyric strings; the target is the album label.
X, y = data['lyrics'], data['album']

In [92]:
# Hold out 25% of the rows for testing (seeded for reproducibility).
# This cell previously asked for a 10% hold-out, but the scores recorded
# further down (e.g. the hold-out accuracy 0.3043... = 7/23) are consistent
# with the 25% split that the later boosting cells create, not with a 10% one.
# Using the same size here means a fresh top-to-bottom run evaluates every
# model on the same hold-out set.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.25, random_state=13
)

In [70]:
# Bag-of-words and TF-IDF text pipelines, each feeding a logistic regression.
#
# The solver is pinned to 'liblinear' because the grids searched over these
# pipelines include penalty='l1'. When the solver is left to the library
# default, scikit-learn releases whose default is 'lbfgs' reject 'l1', so every
# such candidate fails to fit. liblinear accepts both 'l1' and 'l2'.
# n_jobs is no longer passed: scikit-learn documents it as ignored for liblinear.
# NOTE(review): recent scikit-learn releases may warn about liblinear on
# multiclass targets — confirm against the installed version.
cv_pipe_log = Pipeline(
    [
        ('cvec', CountVectorizer()),
        ('logreg', LogisticRegression(solver='liblinear')),
    ]
)

tv_pipe_log = Pipeline(
    [
        ('tvec', TfidfVectorizer()),
        ('logreg', LogisticRegression(solver='liblinear')),
    ]
)

In [71]:
# Hyperparameter grids for the two logistic-regression pipelines.
# Keys use the Pipeline "<step>__<parameter>" naming; candidate order is kept
# as written because it determines the order in which the grid is explored.
cv_pipe_log_params = dict(
    cvec__max_features=[4000, 3000, 5000],
    cvec__ngram_range=[(1, 1), (1, 2)],
    logreg__penalty=['l1', 'l2'],
    logreg__C=[.005, .01, .25],
)

tv_pipe_log_params = dict(
    tvec__max_features=[3000, 4000, 5000],
    tvec__ngram_range=[(1, 1), (1, 2)],
    logreg__penalty=['l1', 'l2'],
    logreg__C=[.005, .25, .01],
)

In [72]:
# Share of rows per album (normalize=True yields fractions instead of counts).
data['album'].value_counts(normalize=True)

Wolf           0.200000
Goblin         0.188889
Bastard        0.166667
Cherry Bomb    0.166667
Flower Boy     0.144444
IGOR           0.133333
Name: album, dtype: float64

In [73]:
# Grid-search the CountVectorizer + logistic-regression pipeline with 3-fold CV.
# Prints the elapsed wall-clock seconds, then the best cross-validated score,
# and displays the winning parameter combination as the cell output.
t0 = time.time()
gs = GridSearchCV(
    estimator=cv_pipe_log,
    param_grid=cv_pipe_log_params,
    cv=3,
)
gs.fit(X_train, y_train)
print(time.time() - t0)
print(gs.best_score_)
gs.best_params_

6.359632253646851
0.5074626865671642


{'cvec__max_features': 4000,
 'cvec__ngram_range': (1, 2),
 'logreg__C': 0.25,
 'logreg__penalty': 'l2'}

In [74]:
t0=time.time()
gs = GridSearchCV(tv_pipe_log, param_grid=tv_pipe_log_params, cv=3)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time()-t0)
gs.best_params_

0.3283582089552239
5.710633993148804


{'logreg__C': 0.25,
 'logreg__penalty': 'l2',
 'tvec__max_features': 3000,
 'tvec__ngram_range': (1, 1)}

In [75]:
# Bag-of-words and TF-IDF text pipelines, each feeding a gradient-boosting
# classifier.
#
# random_state is fixed to 13 — the seed already used for the train/test split
# and for the stand-alone boosting models in this notebook — so repeated grid
# searches over these pipelines give the same scores and best parameters.
cv_pipe_gb = Pipeline(
    [
        ('cvec', CountVectorizer()),
        ('gb', GradientBoostingClassifier(random_state=13)),
    ]
)

tv_pipe_gb = Pipeline(
    [
        ('tvec', TfidfVectorizer()),
        ('gb', GradientBoostingClassifier(random_state=13)),
    ]
)

In [None]:
# Scratch cell: a bare constructor call whose only effect is to display the
# estimator's repr — presumably left in to inspect the default hyperparameters.
# It was never executed (In [None]) and nothing below depends on it; candidate
# for removal.
GradientBoostingClassifier()

In [110]:
# Hyperparameter grids for the two gradient-boosting pipelines.
# Keys use the Pipeline "<step>__<parameter>" naming; candidate order is kept
# as written because it determines the order in which the grid is explored.
cv_pipe_gb_params = dict(
    cvec__max_features=[4000, 5000],
    cvec__ngram_range=[(1, 1), (1, 2)],
    gb__learning_rate=[.1, 0.25],
    gb__n_estimators=[100, 300],
    gb__max_depth=[3, 5],
)

tv_pipe_gb_params = dict(
    tvec__max_features=[5000, 10000, 15000],
    tvec__ngram_range=[(1, 1), (1, 2)],
    gb__learning_rate=[.1, 0.5, 2],
    gb__n_estimators=[100, 300],
    gb__max_depth=[3, 5],
)

In [111]:
# Grid-search the CountVectorizer + gradient-boosting pipeline with 4-fold CV.
# Prints the best cross-validated score, then the elapsed wall-clock seconds,
# and displays the winning parameter combination as the cell output.
t0 = time.time()
gs = GridSearchCV(
    estimator=cv_pipe_gb,
    param_grid=cv_pipe_gb_params,
    cv=4,
)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time() - t0)
gs.best_params_

0.43283582089552236
142.9311900138855


{'cvec__max_features': 4000,
 'cvec__ngram_range': (1, 1),
 'gb__learning_rate': 0.25,
 'gb__max_depth': 3,
 'gb__n_estimators': 100}

In [112]:
# Grid-search the TF-IDF + gradient-boosting pipeline with 4-fold CV.
# Prints the best cross-validated score, then the elapsed wall-clock seconds,
# and displays the winning parameter combination as the cell output.
t0 = time.time()
gs = GridSearchCV(
    estimator=tv_pipe_gb,
    param_grid=tv_pipe_gb_params,
    cv=4,
)
gs.fit(X_train, y_train)
print(gs.best_score_)
print(time.time() - t0)
gs.best_params_

0.373134328358209
320.99049186706543


{'gb__learning_rate': 0.1,
 'gb__max_depth': 3,
 'gb__n_estimators': 300,
 'tvec__max_features': 10000,
 'tvec__ngram_range': (1, 2)}

In [76]:
# Build the vectorizer and classifier by hand with fixed hyperparameters:
# unigram+bigram counts capped at 4000 terms, and an L2-penalised logistic
# regression with C=0.25.
cvec = CountVectorizer(ngram_range=(1, 2), max_features=4000)
logreg = LogisticRegression(penalty='l2', C=0.25)

In [77]:
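# Best parameters reported by the CountVectorizer + LogisticRegression grid
# search, kept for reference (they are the values hard-coded in the previous cell):
# {'cvec__max_features': 4000,
#  'cvec__ngram_range': (1, 2),
#  'logreg__C': 0.25,
#  'logreg__penalty': 'l2'}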

In [78]:
# Learn the vocabulary on the training lyrics only, then reuse it for the
# hold-out set so no test-set vocabulary is seen during fitting.
X_train_tf = cvec.fit_transform(X_train)
X_test_tf = cvec.transform(X_test)

# Fit on the training matrix and display accuracy on the hold-out matrix
# (fit returns the estimator itself, so the calls can be chained).
logreg.fit(X_train_tf, y_train).score(X_test_tf, y_test)

0.30434782608695654

In [106]:
# Re-split with a 25% hold-out (same seed); this overwrites the earlier
# X_train / X_test / y_train / y_test variables.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.25,
    random_state=13,
)

In [107]:
# Untuned baseline: a seeded gradient-boosting model on default count features.
gb = GradientBoostingClassifier(random_state=13)
cvec = CountVectorizer()

In [108]:
# Vectorize with a vocabulary learned from the training lyrics only.
X_train_tf = cvec.fit_transform(X_train)
X_test_tf = cvec.transform(X_test)

# Fit the boosting model and display its hold-out accuracy
# (fit returns the estimator itself, so the calls can be chained).
gb.fit(X_train_tf, y_train).score(X_test_tf, y_test)

0.4782608695652174

In [109]:
# Same untuned baseline, but on TF-IDF features instead of raw counts.
tvec = TfidfVectorizer()
gb = GradientBoostingClassifier(random_state=13)

# Vectorize with a vocabulary learned from the training lyrics only.
X_train_tf = tvec.fit_transform(X_train)
X_test_tf = tvec.transform(X_test)

# Fit the boosting model and display its hold-out accuracy
# (fit returns the estimator itself, so the calls can be chained).
gb.fit(X_train_tf, y_train).score(X_test_tf, y_test)

0.391304347826087