In [None]:
import sys
import spacy
import re
import pickle
import numpy as np
import pandas as pd
import scipy as sp
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en.stop_words import STOP_WORDS 
from collections import Counter
from plotnine import *
from pandas.tseries.offsets import MonthBegin
from yellowbrick.features import Rank2D
import feather
import matplotlib.pyplot as plt
from sklearn.feature_selection import VarianceThreshold
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import RandomizedSearchCV
from imblearn.over_sampling import SMOTE, ADASYN
from imblearn.combine import SMOTEENN
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

In [None]:
# Load the review table from a Feather file located in the working directory.
pitchfork_data = feather.read_dataframe('pitchfork_tfidf.feather')


In [None]:
pitchfork_data.head()

### Let's take a look at the number of reviews published each month and how the average score changes over time.

In [None]:
# Bucket every review into the calendar month it was published in.
# `+ MonthBegin(0)` rolls any date that is not already the 1st FORWARD to the
# next month start (e.g. Jan 15 -> Feb 1), which shifted most reviews into the
# wrong month. Truncating through a monthly period keeps them in their own month.
# Unparseable dates become NaT because of errors='coerce'.
publication_dates = pd.to_datetime(pitchfork_data['publication_date'], errors='coerce')
pitchfork_data['month'] = publication_dates.dt.to_period('M').dt.to_timestamp()
pitchfork_data['month_label'] = pitchfork_data['month'].dt.strftime('%b')

# Monthly mean score and monthly number of reviews, computed in one pass.
# Resulting columns: 'month', 'score' (mean score), 'count' (number of reviews).
by_date = (
    pitchfork_data.groupby('month')['score']
    .agg(['mean', 'size'])
    .rename(columns={'mean': 'score', 'size': 'count'})
    .reset_index()
)

In [None]:
# Preview the first rows only; displaying the whole frame floods the notebook output.
pitchfork_data.head()

In [None]:
# Average score per month over time, with a red loess (local regression) curve
# showing the general trend. Drawn with plotnine, which uses ggplot2 syntax.
score_trend_plot = (
    ggplot(by_date, aes(x='month', y='score'))
    + geom_line()
    + geom_smooth(method='loess', color='red')
    + scale_x_date(breaks='1 year')
    + xlab('Year')
    + ylab('Average Album Score')
    + theme(axis_text_x=element_text(angle=90))
)
# Write the figure to disk as an 8x4 inch PNG at 500 dpi.
score_trend_plot.save('yearly.png', width=8, height=4, units='in', dpi=500)

In [None]:
# Number of reviews per month over time, again with a red loess trend line.
# The author attributes the recurring bumps to seasonal (holiday) effects.
review_count_plot = (
    ggplot(by_date, aes(x='month', y='count'))
    + geom_line()
    + geom_smooth(method='loess', color='red')
    + scale_x_date(breaks='1 year')
    + theme(axis_text_x=element_text(angle=90))
)
review_count_plot

### Are there any intrinsic differences between reviewers? Let's take a look.

In [None]:
# Per-reviewer summary: number of reviews and mean score, ordered from the
# most to the least prolific reviewer, with a fresh 0..n-1 index.
reviewer_data = (
    pitchfork_data.groupby('author')['score']
    .agg(['size', 'mean'])
    .reset_index()
    .sort_values('size', ascending=False)
    .reset_index(drop=True)
    .rename(columns={'size': 'review_count', 'mean': 'averge_score'})
)


In [None]:
# Horizontal bars of review counts for the first 26 rows of reviewer_data
# (index labels 0 through 25).
reviewers_for_counts = reviewer_data.loc[0:25, :]
(
    ggplot(reviewers_for_counts, aes(x='author', y='review_count'))
    + geom_bar(stat='identity')
    + theme(axis_text_x=element_text(angle=90))
    + coord_flip()
)

In [None]:
# Horizontal bars of the average review score for the same first 26 rows of
# reviewer_data (index labels 0 through 25).
reviewers_for_averages = reviewer_data.loc[0:25, :]
(
    ggplot(reviewers_for_averages, aes(x='author', y='averge_score'))
    + geom_bar(stat='identity')
    + theme(axis_text_x=element_text(angle=90))
    + coord_flip()
)

In [None]:
# Averages alone hide how a reviewer's scores are spread out, so pull out the
# reviews of four reviewers the author singles out as frequent contributors.
author_column = pitchfork_data['author']

joe_tangari = pitchfork_data.loc[author_column == 'Joe Tangari']
ian_cohen = pitchfork_data.loc[author_column == 'Ian Cohen']
steven_d = pitchfork_data.loc[author_column == 'Stephen M. Deusner']
stuart_berman = pitchfork_data.loc[author_column == 'Stuart Berman']

In [None]:
# Histogram of Joe Tangari's scores (20 bins, a tick at every integer 0-10).
# Author's observation: his scores sit fairly consistently around 7-8.
tangari_ticks = np.arange(0, 11, 1)
(
    ggplot(joe_tangari, aes(x='score'))
    + geom_histogram(bins=20, fill='purple', color='black')
    + scale_x_continuous(breaks=tangari_ticks)
)

In [None]:
# Histogram of Ian Cohen's scores (20 bins, a tick at every integer 0-10).
# Author's observation: the distribution is much wider, i.e. his scoring is
# more varied.
cohen_ticks = np.arange(0, 11, 1)
(
    ggplot(ian_cohen, aes(x='score'))
    + geom_histogram(bins=20, fill='purple', color='black')
    + scale_x_continuous(breaks=cohen_ticks)
)

In [None]:
# Histogram of Stephen M. Deusner's scores (20 bins, a tick at every integer 0-10).
deusner_ticks = np.arange(0, 11, 1)
(
    ggplot(steven_d, aes(x='score'))
    + geom_histogram(bins=20, fill='purple', color='black')
    + scale_x_continuous(breaks=deusner_ticks)
)

In [None]:
# Histogram of Stuart Berman's scores (20 bins, a tick at every integer 0-10).
berman_ticks = np.arange(0, 11, 1)
(
    ggplot(stuart_berman, aes(x='score'))
    + geom_histogram(bins=20, fill='purple', color='black')
    + scale_x_continuous(breaks=berman_ticks)
)

In [None]:
# Per-reviewer subsets of the review table.
# NOTE(review): these four assignments repeat, value for value, the ones made
# in an earlier cell of this notebook; the cell looks redundant.
joe_tangari = pitchfork_data[pitchfork_data['author'].eq('Joe Tangari')]
ian_cohen = pitchfork_data[pitchfork_data['author'].eq('Ian Cohen')]
steven_d = pitchfork_data[pitchfork_data['author'].eq('Stephen M. Deusner')]
stuart_berman = pitchfork_data[pitchfork_data['author'].eq('Stuart Berman')]

In [None]:
# Reviews written by the six reviewers compared in the faceted histogram.
selected_authors = [
    'Joe Tangari',
    'Ian Cohen',
    'Stephen M. Deusner',
    'Stuart Berman',
    'Brian Howe',
    'Mark Richardson',
]
author_hist = pitchfork_data.loc[pitchfork_data['author'].isin(selected_authors)]

In [None]:
# One score histogram per selected reviewer, laid out in a six-row facet grid.
facet_ticks = np.arange(0, 11, 1)
(
    ggplot(author_hist, aes(x='score'))
    + geom_histogram(bins=20, fill='purple', color='black')
    + scale_x_continuous(breaks=facet_ticks)
    + facet_wrap("~ author", nrow=6)
)

### Let's take a look at word frequencies.

In [None]:
# Column totals for every word column, i.e. everything from 'like' onward.
# The 'month' / 'month_label' helper columns are appended to the END of
# pitchfork_data earlier in this notebook, so the open-ended 'like': slice
# would sweep them in. Drop them and keep only numeric columns before summing.
# NOTE(review): presumably each remaining column is one word's weight per review — confirm.
word_columns = (
    pitchfork_data.loc[:, 'like':]
    .drop(columns=['month', 'month_label'], errors='ignore')
    .select_dtypes(include='number')
)
# reset_index() yields two columns: 'index' (the word) and 0 (its total).
frequencies = word_columns.sum().reset_index()

In [None]:
# Give the two columns produced by reset_index() descriptive names.
frequencies = frequencies.rename(columns={'index': 'word', 0: 'count'})

In [None]:
# Bar chart of the first 51 words (index labels 0 through 50), ordered by total.
# plotnine orders a plain string axis alphabetically, so sort_values() alone has
# no visible effect; turning 'word' into an ordered categorical whose category
# order follows the sorted rows makes the bars appear in ascending order.
word_totals = frequencies.loc[:50, :].sort_values('count')
word_order = word_totals['word'].drop_duplicates().tolist()
word_totals = word_totals.assign(
    word=pd.Categorical(word_totals['word'], categories=word_order, ordered=True)
)
(
    ggplot(word_totals, aes(x='word', y='count'))
    + geom_bar(stat='identity', color='black', fill='purple')
    + coord_flip()
)

### Let's build out our dataframes to prepare for other exploratory analysis and modeling.

In [None]:
# Target vector: the raw values of the 'category' column.
y = pitchfork_data['category'].values

In [None]:
# NOTE(review): no earlier cell visible in this notebook assigns X, so on a
# fresh kernel this cell raises NameError — confirm where X should come from.
X

In [None]:
month_dummies = pd.get_dummies(pitchfork_data['month_label'])

In [None]:
author_dummies = pd.get_dummies(pitchfork_data['author'])


In [None]:
# Place the month and author indicator columns side by side (joined on the index).
# Any author column whose name collides with a month column gets an 'r' suffix.
dummies = month_dummies.join(author_dummies, rsuffix='r')

In [None]:
# One-column frame of review scores with a fresh 0..n-1 index.
scores = pitchfork_data[['score']].reset_index(drop=True)


In [None]:
# Attach the indicator columns to the scores; the join aligns rows on the index.
# NOTE(review): `scores` has a reset 0..n-1 index, so this assumes pitchfork_data
# (and therefore `dummies`) also carries a default 0..n-1 index — TODO confirm.
all_pitchfork_data = scores.join(dummies)

In [None]:
# Persist the combined score + indicator table as a Feather file in the working directory.
feather.write_dataframe(all_pitchfork_data, 'all_pitchfork_data_w_dummies.feather')

In [None]:
# Preview the first rows only; the frame has one column per month and per author.
all_pitchfork_data.head()

In [None]:
# Pairwise Pearson-correlation ranking of every feature column (all columns
# after the leading 'score' column).
features = all_pitchfork_data.columns[1:]
visualizer = Rank2D(features=features, algorithm='pearson')

# X used to be defined only in commented-out code, so this cell raised a
# NameError on a fresh kernel. It is built as a plain 2-D float array because
# feature_selection() further down indexes it positionally (data_set[:, col]).
X = all_pitchfork_data[features].values.astype(float)

# NOTE(review): this rebinds `y` to the review scores, replacing the 'category'
# values assigned to `y` earlier in the notebook — confirm which target the
# later train/test split and classifiers are meant to use.
y = all_pitchfork_data[['score']].reset_index(drop=True)

visualizer.fit(X, y)                # Fit the data to the visualizer
visualizer.transform(X)             # Transform the data
visualizer.poof()                   # Render the figure

In [None]:
# Drop the columns of X whose variance does not exceed 0.01.
# NOTE(review): X_high_variance is not referenced again in the visible notebook.
thresholder = VarianceThreshold(threshold=.01)
X_high_variance = thresholder.fit_transform(X)

In [None]:
def feature_selection(data_set, feature_names, threshold=.001):
    """Drop near-constant columns and report which columns survived.

    :param data_set: 2-D feature matrix (NumPy array or DataFrame), one column
        per feature.
    :param feature_names: sequence of names positionally aligned with the
        columns of ``data_set``.
    :param threshold: columns whose variance does not exceed this value are
        removed. Defaults to the previously hard-coded ``0.001``.
    :return: tuple ``(feature_set, fea_index)`` where ``feature_set`` is the
        reduced matrix returned by the selector and ``fea_index`` is the list
        of positions (in ``data_set``) of the columns that were kept, in
        ascending order.
    """
    sel = VarianceThreshold(threshold=threshold)
    feature_set = sel.fit_transform(data_set)

    # Ask the selector which columns it kept. Matching columns by value
    # equality (the previous approach) listed a column once per identical kept
    # column and also reported dropped columns that happened to equal a kept
    # one; it was also quadratic in the number of columns.
    fea_index = sel.get_support(indices=True).tolist()

    # Debug aid kept from the original: kept feature name -> value in the
    # first row. np.asarray lets DataFrame input be indexed positionally.
    first_row = np.asarray(data_set)[0]
    check = {feature_names[i]: first_row[i] for i in fea_index}
    print(check)

    return feature_set, fea_index

In [None]:
feature_set, fea_index =feature_selection(X, features)

In [None]:
fea_index

In [None]:
# Keep only the feature columns (everything after the first column) whose
# positions are listed in fea_index, then display the result.
X = all_pitchfork_data.iloc[:, 1:].iloc[:, fea_index]
X

In [None]:
# Hold out 20% of the rows for testing. A fixed seed replaces the previous
# np.random.randint(...) seed, which produced a different split on every run
# and made all downstream scores irreproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Oversample the training split with SMOTE (seeded so the synthetic rows are
# reproducible). `fit_sample` was renamed to `fit_resample` in imbalanced-learn
# and later removed; prefer the new name and fall back for old installs.
smote = SMOTE(random_state=42)
smote_resample = getattr(smote, 'fit_resample', None) or smote.fit_sample
X_res, y_res = smote_resample(X_train, y_train)

In [None]:
# Search space for the random-forest randomized searches.
# min_samples_split starts at 2: scikit-learn rejects an integer value of 1
# (a node needs at least two samples to be split), so sampling it made
# candidate fits fail.
parameters = {
    'n_estimators': np.arange(1, 650, 50),
    'max_depth': np.arange(1, 31, 2),
    'min_samples_leaf': np.arange(1, 31, 1),
    'min_samples_split': np.arange(2, 31, 1),
}

In [None]:
rf = RandomForestRegressor()

In [None]:
# Randomized search over `parameters` for the estimator bound to `rf`:
# 20 sampled settings, 5-fold cross-validation, scored by R^2, all CPU cores.
clf = RandomizedSearchCV(rf, parameters, random_state=42, n_iter=20, cv=5, n_jobs=-1, scoring='r2')

In [None]:
best_model = clf.fit(X_train, y_train)

In [None]:
best_model.cv_results_

In [None]:
# The randomized search further down samples both 'l1' and 'l2' penalties.
# liblinear supports both; the default solver of newer scikit-learn releases
# (lbfgs) rejects 'l1', which made those candidates fail.
lr = linear_model.LogisticRegression(solver='liblinear')
#cv_scores =  cross_val_score(lr, X_res, y_res, cv=10, scoring='f1')

In [None]:
# NOTE(review): the only assignment to cv_scores visible in this notebook is
# commented out, so on a fresh kernel this cell raises NameError.
cv_scores

In [None]:
# Search space for logistic regression: 15 log-spaced values of C between
# 10^0 and 10^4, combined with both penalty types.
penalty = ['l1', 'l2']
C = np.logspace(0, 4, num=15)
hyperparameters = {'C': C, 'penalty': penalty}
hyperparameters

In [None]:
# Randomized search over `hyperparameters` for logistic regression:
# 5 sampled settings, 5-fold cross-validation, scored by F1, all CPU cores.
# random_state is fixed (as in the other searches in this notebook) so the
# sampled candidates are the same on every run.
clf = RandomizedSearchCV(lr, hyperparameters, cv=5, verbose=1, n_jobs=-1, n_iter=5, scoring='f1', random_state=42)

In [None]:
best_model = clf.fit(X_res, y_res)

In [None]:
best_model.best_score_

In [None]:
rf =RandomForestClassifier()

In [None]:
# Randomized search for the random-forest classifier (5 sampled settings,
# 5-fold cross-validation, scored by F1), fitted on the resampled training data.
clf = RandomizedSearchCV(
    rf,
    parameters,
    n_iter=5,
    cv=5,
    scoring='f1',
    n_jobs=-1,
    random_state=42,
)
best_model_rf = clf.fit(X_res, y_res)

In [None]:
best_model_rf.best_score_

In [None]:
best_model.predict(X)

In [None]:
best_model.best_estimator_

In [None]:
from yellowbrick.classifier import ROCAUC

In [None]:
# ROC/AUC curves for the tuned estimator held in `best_model`: fitted on the
# resampled training data and scored on the held-out test split.
# NOTE(review): `classes` only supplies display labels — confirm that this
# order matches the order of the estimator's classes.
fig = plt.figure(figsize=(10,10), dpi=250)
visualizer = ROCAUC(best_model.best_estimator_, micro=False, macro=False, per_class=True, classes=['Best New Music', 'Not Best New Music'])
visualizer.fit(X_res, y_res)
visualizer.score(X_test, y_test)
g = visualizer.poof()  

In [None]:
# Confusion matrix (shown as percentages) for the tuned estimator held in
# `best_model`: fitted on the resampled training data, scored on the test split.
from yellowbrick.classifier import ConfusionMatrix
fig = plt.figure(figsize=(8,8), dpi=250)
cm = ConfusionMatrix(best_model.best_estimator_, percent=True)
cm.fit(X_res, y_res)
cm.score(X_test, y_test)
cm.poof()
# Save the figure created above to disk.
# NOTE(review): savefig runs after poof() has rendered the figure — verify that
# cm.png is not blank with the backend in use.
fig.savefig('cm.png', bbox_inches='tight')

In [None]:
len(X_res)

In [None]:
# Rebind `features` to the column labels of pitchfork_data from position 6 up
# to, but not including, the last column.
# NOTE(review): X / X_res used to fit the models were built from
# all_pitchfork_data, so these labels presumably do not line up with the
# model's inputs — verify before using them as feature labels.
features = pitchfork_data.iloc[:,6:-1].columns

In [None]:
from yellowbrick.features.importances import FeatureImportances
import matplotlib.pyplot as plt

In [None]:
# Feature-importance chart for the tuned estimator held in `best_model`.
# Create a new (very tall) figure so that every feature gets its own row.
fig = plt.figure(figsize=(5,100), dpi=250)
# An explicit 111 keeps add_subplot working on matplotlib versions that
# require positional arguments.
ax = fig.add_subplot(111)
# Label the bars with the columns the model was actually trained on (X).
# `features` is rebound earlier to columns of pitchfork_data, which are not
# the model's inputs, so using it here mislabelled the bars.
labels = [str(name).title() for name in X.columns]
viz = FeatureImportances(best_model.best_estimator_, ax=ax, labels=labels, relative=False)
viz.fit(X_res, y_res)
viz.poof()

In [None]:
# Two-column table: feature names from X next to the fitted coefficients
# (coef_ is transposed so that each feature occupies one row).
coefficient_names = pd.DataFrame(X.columns)
coefficient_values = pd.DataFrame(best_model.best_estimator_.coef_.T)
coefficients = pd.concat([coefficient_names, coefficient_values], axis=1)

In [None]:
coefficients.columns = ['word', 'coef']

In [None]:
coefficients.to_csv('coefficients.csv', index=False)

In [None]:
# Horizontal bars of the coefficients in the first 26 rows (index labels 0-25).
first_coefficients = coefficients.loc[0:25, :]
(
    ggplot(first_coefficients, aes(x='word', y='coef'))
    + geom_bar(stat='identity')
    + theme(axis_text_x=element_text(angle=90))
    + coord_flip()
)

In [None]:
import pandas as pd
import numpy as np
from pandas.api.types import CategoricalDtype
from plotnine import *
from plotnine.data import mpg

In [None]:
# Build an ordered categorical so that plots can show the bars sorted by
# coefficient. The categories were previously taken from value_counts() of the
# names, which orders by how often a name occurs, not by its coefficient.
coef_list = (
    coefficients.sort_values('coef')['word']
    .astype(str)
    .drop_duplicates()   # categories must be unique
    .tolist()
)
coef_cat = CategoricalDtype(categories=coef_list, ordered=True)
coefficients['coef_cat'] = coefficients['word'].astype(str).astype(coef_cat)

In [None]:
# Same chart as before, but using the ordered categorical column on the axis
# (first 26 rows, index labels 0-25).
first_coefficients_cat = coefficients.loc[0:25, :]
(
    ggplot(first_coefficients_cat, aes(x='coef_cat', y='coef'))
    + geom_bar(stat='identity')
    + theme(axis_text_x=element_text(angle=90))
    + coord_flip()
)