In [1]:
import pandas as pd

def load_dataset(filename, cols):
    dataset = pd.read_csv(filename, encoding='latin-1')
    dataset.columns = cols
    return dataset

dataset = load_dataset("StrictOMD.csv", ['target', 'text'])

In [2]:
import re
import string
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))

def preprocess_tweet_text(tweet):
    tweet.lower()
    # Remove urls
    tweet = re.sub(r"http\S+|www\S+|https\S+", '', tweet, flags=re.MULTILINE)
    # Remove user @ references and '#' from tweet
    tweet = re.sub(r'\@\w+|\#\w+', '', tweet)
    # Remove punctuations
    tweet = tweet.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    tweet_tokens = word_tokenize(tweet)
    filtered_words = [w for w in tweet_tokens if not w in stop_words]

    ps = PorterStemmer()
    stemmed_words = [ps.stem(w) for w in filtered_words]
    lemmatizer = WordNetLemmatizer()
    lemma_words = [lemmatizer.lemmatize(w, pos='a') for w in stemmed_words]

    result = " ".join(lemma_words)
    # print(result)

    return result

dataset.text = dataset['text'].apply(preprocess_tweet_text)



In [8]:
import numpy as np
# from zeugma.embeddings import EmbeddingTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

def get_feature_vector(train_fit):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(train_fit)
    return vector

tf_vector = get_feature_vector(np.array(dataset.iloc[:, 1]).ravel())
# glove = EmbeddingTransformer('glove')

X = tf_vector.transform(np.array(dataset.iloc[:, 1]).ravel())
# X = glove.transform(np.array(dataset.iloc[:, 1]).ravel())
y = np.array(dataset.iloc[:, 0]).ravel()

In [9]:
from sklearn.model_selection import cross_val_score, ShuffleSplit
cv = ShuffleSplit(n_splits=5, test_size=0.3, random_state=0)

In [10]:
from sklearn.naive_bayes import MultinomialNB

NB_model = MultinomialNB()
scores = cross_val_score(NB_model, X, y, cv=cv)
print("Naive Bayes Model:", scores, sep='\t')



Naive Bayes Model:	[0.74545455 0.81818182 0.76363636 0.74909091 0.77818182]


In [11]:
from sklearn.linear_model import LogisticRegression

LR_model = LogisticRegression(solver='lbfgs')
scores = cross_val_score(LR_model, X, y, cv=cv)
print("Logistic Regression:", scores, sep='\t')



Logistic Regression:	[0.76727273 0.80363636 0.76       0.75272727 0.79636364]


In [12]:
from sklearn.svm import SVC

SVC_model = SVC(kernel='linear', C=1, random_state=42)
scores = cross_val_score(SVC_model, X, y, cv=cv)
print("Support vector machine:", scores, sep='\t')



Support vector machine:	[0.81090909 0.82545455 0.81454545 0.82181818 0.82909091]
