In [13]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# IPython line magic invoked without a backend argument; the recorded run
# reported the Qt5Agg backend being selected.
%matplotlib

Using matplotlib backend: Qt5Agg


In [2]:
# Category name list; no other cell visible in this notebook reads it.
categories = [
    'alt.atheism',
    'soc.religion.christian',
    'comp.graphics',
    'sci.med',
]

In [16]:
# Read the happy / angry / sad files (tab-separated, no header row), in that
# order, keeping one DataFrame per file.
happy_train, angry_train, sad_train = (
    pd.read_csv(csv_path, header=None, sep='\t')
    for csv_path in (
        './CrawlComment/final_final_happy.csv',
        './CrawlComment/final_final_angry.csv',
        './CrawlComment/final_final_sad.csv',
    )
)
frames = [happy_train, angry_train, sad_train]

# Stack the three frames row-wise and renumber the index from zero, then
# print the column/dtype summary of the combined frame.
df = pd.concat(frames, ignore_index=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8591 entries, 0 to 8590
Data columns (total 2 columns):
0    8591 non-null object
1    8591 non-null int64
dtypes: int64(1), object(1)
memory usage: 134.4+ KB


In [17]:
# Column 0 is the model input (later fed to CountVectorizer); column 1 is the
# integer target label.
X, y = df[0], df[1]

# Shuffle with a fixed seed and hold out 30% of the rows for testing.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=True,
    random_state=1,
)

# Print, in order: the two training shapes, the distinct training labels,
# and how many training rows carry each label.
for summary in (
    X_train.shape,
    y_train.shape,
    y_train.unique(),
    y_train.value_counts(),
):
    print(summary)

(6013,)
(6013,)
[1 0 3 2]
1    2130
3    1605
2    1159
0    1119
Name: 1, dtype: int64


In [5]:
# Learn the vocabulary from the training text and build its count matrix in
# a single pass.
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(X_train)
X_train_counts.shape  # bare mid-cell expression: evaluated, but not echoed

# Look up the column index assigned to the token 'please'; dict.get yields
# None when the token is absent from the learned vocabulary.
vocabulary = count_vect.vocabulary_
vocabulary.get(u'please')

4697

In [6]:
# Variant with IDF weighting switched off, applied to the training counts.
tf_transformer = TfidfTransformer(use_idf=False)
X_train_tf = tf_transformer.fit_transform(X_train_counts)
X_train_tf.shape  # bare mid-cell expression: evaluated, but not echoed

# Default (IDF-enabled) transformer. Both the fitted transformer and the
# resulting matrix are consumed by the Naive Bayes cell further down.
tfidf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tfidf_transformer.transform(X_train_counts)
X_train_tfidf.shape

(6013, 6994)

In [7]:
# Train a multinomial Naive Bayes classifier on the TF-IDF training matrix.
clf = MultinomialNB()
clf.fit(X_train_tfidf, y_train)

# A handful of hand-written sentences used as a quick sanity probe.
docs_new = [
    'somebody to love',
    'I hate youtube',
    'I am crying',
    'you are beautiful',
    'I get angry',
]

# Reuse the already-fitted vectorizer and TF-IDF transformer (transform only,
# no refitting) so the probe sentences share the training feature space.
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)

# Predicted label per probe sentence; echoed as the cell output.
predicted = clf.predict(X_new_tfidf)
predicted

array([1, 2, 3, 1, 2], dtype=int64)

In [8]:
# Chain count vectorization, TF-IDF weighting and Naive Bayes into a single
# estimator that is fitted directly on the raw training series.
nb_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(steps=nb_steps)

# Left as the final expression so the fitted pipeline's repr is echoed.
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [9]:
# Accuracy of the fitted pipeline on the held-out split: the fraction of test
# rows whose predicted label equals the true label.
# (numpy is imported as `np` in the notebook's import cell rather than here,
# so every dependency is declared in one place at the top.)
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.6233514352211016

In [10]:
# Same vectorize -> TF-IDF front end, but with an SGD-trained linear
# classifier (hinge loss, L2 penalty) as the final stage. `text_clf` is
# rebound to this new pipeline, replacing the previous one.
sgd_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        max_iter=100,
        tol=None,
        random_state=42,
    )),
]
text_clf = Pipeline(steps=sgd_steps)
text_clf.fit(X_train, y_train)

# Fraction of held-out rows labelled correctly; echoed as the cell output.
predicted = text_clf.predict(X_test)
np.mean(predicted == y_test)

0.660589604344453

In [11]:
# Both metrics below are computed on string-cast copies of the true and
# predicted labels, so cast once and reuse.
y_true_str = y_test.astype(str)
y_pred_str = predicted.astype(str)

# Per-class precision / recall / F1 table for the most recent predictions.
report = metrics.classification_report(
    y_true_str,
    y_pred_str,
    target_names=['0', '1', '2', '3'],
)
print(report)

# Confusion matrix, echoed as the cell output. In the recorded output each
# row sums to that class's support, i.e. rows are true labels.
metrics.confusion_matrix(y_true_str, y_pred_str)

              precision    recall  f1-score   support

           0       0.65      0.31      0.42       412
           1       0.67      0.81      0.74       998
           2       0.62      0.58      0.60       506
           3       0.67      0.71      0.69       662

    accuracy                           0.66      2578
   macro avg       0.65      0.60      0.61      2578
weighted avg       0.66      0.66      0.65      2578



array([[127, 163,  48,  74],
       [ 30, 809,  72,  87],
       [ 16, 122, 296,  72],
       [ 23, 108,  60, 471]], dtype=int64)