In [34]:
import nltk
import numpy as np
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import KFold, cross_val_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

In [35]:
# Load the cleaned tweet dataset.
# The path is a raw string because it contains backslashes followed by
# characters that are not valid escape sequences; a plain literal only works
# by accident and emits a DeprecationWarning/SyntaxWarning on recent Pythons.
DATASET_PATH = r'D:\X\clean_tweets_all_a.xlsx'
dataset_file = pd.read_excel(DATASET_PATH)

# Split into training and test sets: the 'Tweet' column is the input text and
# the 'Age' column is the target. 30% of the rows are held out, and the fixed
# random_state keeps the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    dataset_file['Tweet'], dataset_file['Age'], random_state=42, test_size=0.3)

In [36]:
# Support Vector Machines (SVM)
# svm_model = Pipeline([('vect', CountVectorizer()),('tfidf', TfidfTransformer()),
#                          ('clf-svm', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, n_iter=5, random_state=42)),])
# _ = svm_model.fit(X_train.values.astype('U'), y_train)
# predict_svm_model = svm_model.predict(X_test.values.astype('U'))
# print("accuracy score: " + str(accuracy_score(predict_svm_model,y_test)))

In [37]:
# stemming and svm
# Module-level English Snowball stemmer; the vectorizer below looks it up by
# name each time a document is analyzed.
stemmer = SnowballStemmer("english", ignore_stopwords=False)


class StemmedCountVectorizer(CountVectorizer):
    """CountVectorizer variant whose analyzer stems every token it produces."""

    def build_analyzer(self):
        """Return a callable mapping a document to its list of stemmed tokens.

        The parent class's analyzer does the preprocessing/tokenizing; each
        token it yields is then passed through the module-level ``stemmer``.
        """
        base_analyzer = super().build_analyzer()

        def stemmed_tokens(doc):
            return [stemmer.stem(token) for token in base_analyzer(doc)]

        return stemmed_tokens
#
#
# Pipeline: stemmed bag-of-words -> TF-IDF weighting -> linear SVM (hinge loss)
# trained with SGD. The "mnb" variable names are kept as-is because later
# cells read `predicted_mnb_stemmed`.
stemmed_count_vect = StemmedCountVectorizer()
text_mnb_stemmed = Pipeline([
    ('vect', stemmed_count_vect),
    ('tfidf', TfidfTransformer()),
    # `n_iter` was deprecated in scikit-learn 0.19 and removed in 0.21.
    # `max_iter=10` with `tol=None` disables the early-stopping criterion and
    # runs the same fixed 10 passes over the training data.
    ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001,
                              max_iter=10, tol=None, random_state=42)),
])
# astype('U') casts every entry to a unicode string before vectorizing.
text_mnb_stemmed = text_mnb_stemmed.fit(X_train.values.astype('U'), y_train)
predicted_mnb_stemmed = text_mnb_stemmed.predict(X_test.values.astype('U'))
# accuracy_score takes (y_true, y_pred); accuracy itself is symmetric.
print("accuracy score: " + str(accuracy_score(y_test, predicted_mnb_stemmed)))



accuracy score: 0.9302325581395349


In [38]:
# Print the confusion-matrix counts and the per-class report for the hold-out set.
# The classifier's known classes are passed as `labels` so the matrix keeps one
# row/column per class even when a class is absent from both y_test and the
# predictions; without it the matrix can collapse to 1x1 and the four-way
# unpacking below raises ValueError.
class_labels = text_mnb_stemmed.named_steps['clf-svm'].classes_
tn, fp, fn, tp = confusion_matrix(
    y_test, predicted_mnb_stemmed, labels=class_labels).ravel()
# tp: when it is predicted adult and is adult
print("tn: " + str(tn))
print("tp: " + str(tp))
print("fn: " + str(fn))
print("fp: " + str(fp))
print(classification_report(y_test, predicted_mnb_stemmed))

tn: 0
tp: 80
fn: 2
fp: 4
             precision    recall  f1-score   support

          0       0.00      0.00      0.00         4
          1       0.95      0.98      0.96        82

avg / total       0.91      0.93      0.92        86



In [39]:
# Sanity check: report the size of each split and of the prediction vector,
# one "<label><length>" line per entry.
_split_sizes = [
    ("X_train: ", X_train),
    ("X_test: ", X_test),
    ("y_train: ", y_train),
    ("y_test: ", y_test),
    ("predicted: ", predicted_mnb_stemmed),
]
print("\n".join(label + str(len(part)) for label, part in _split_sizes))

X_train: 199
X_test: 86
y_train: 199
y_test: 86
predicted: 86


In [None]:
# K-fold cross-validation of the stemmed TF-IDF + linear-SVM pipeline.
# `stemmer` and `StemmedCountVectorizer` come from the "stemming and svm" cell
# above; they are not redefined here, and the stemmer is not rebuilt per fold.
# KFold is imported in the notebook's import cell.
k_fold = KFold(n_splits=10)
X = dataset_file['Tweet']
y = dataset_file['Age']
for train_indices, test_indices in k_fold.split(X):
    # KFold yields positional indices, so select rows with .iloc; plain []
    # indexing is label-based and only lines up for a default RangeIndex.
    # Fold-local names keep the earlier hold-out split (X_train, X_test,
    # y_train, y_test) and its fitted model intact for the cells above.
    X_fold_train, X_fold_test = X.iloc[train_indices], X.iloc[test_indices]
    # Labels must be sliced from y (not X) so that predictions are scored
    # against the true classes rather than against the tweet text.
    y_fold_train, y_fold_test = y.iloc[train_indices], y.iloc[test_indices]

    # A fresh pipeline per fold: stemmed counts -> TF-IDF -> linear SVM via SGD.
    fold_model = Pipeline([
        ('vect', StemmedCountVectorizer()),
        ('tfidf', TfidfTransformer()),
        # `n_iter` was removed in scikit-learn 0.21; `max_iter=10, tol=None`
        # runs the same fixed 10 passes over the training data.
        ('clf-svm', SGDClassifier(loss='hinge', penalty='l2', alpha=0.0001,
                                  max_iter=10, tol=None, random_state=42)),
    ])
    fold_model.fit(X_fold_train.values.astype('U'), y_fold_train)
    fold_predictions = fold_model.predict(X_fold_test.values.astype('U'))
    print("accuracy score: " + str(accuracy_score(y_fold_test, fold_predictions)))