In [223]:
from nltk.stem import WordNetLemmatizer

# Scratch cell: a WordNet lemmatizer is created here, but no other cell shown in
# this notebook uses it (the write-up below notes the data was not lemmatized).
lemmatizer = WordNetLemmatizer()

# Commented-out experiment: lemmatizing two adjective forms with an explicit
# part-of-speech tag (pos="a" selects the adjective part of speech).
#print("greater: ", lemmatizer.lemmatize("greater", pos="a"))
#print("greatest: ", lemmatizer.lemmatize("greatest", pos="a"))

We have a collection of tweets, and we are trying to predict whether a tweet is positive or negative. Our dataset consists of two text files: one containing only positive tweets and the other containing only negative tweets.

First, we clean the data by removing all words that are user handles, words that contain numbers, and any words that contain non-alphabetical characters. 

Next, we convert our data to numerical data by using scikit-learn's CountVectorizer, which tokenizes all the words in the dataset. After fit_transform has been called, a CountVectorizer instance holds the feature names, i.e. the set of all unique words in our dataset; they can be accessed by calling get_feature_names on the vectorizer (get_feature_names_out in scikit-learn 1.0 and later). Furthermore, we can see which features each sample in our dataset contains by calling fit_transform on our dataset and then converting the result to an array with toarray.

Then, we split the dataset into a training set and a test set by calling train_test_split. We used 90% of the data for training and held out the remaining 10% for testing.

We fit a logistic regression model to the training data and, with that, achieved an accuracy of roughly 80% on the test set.

It should be noted that the dataset was not stemmed or lemmatized. 

Questions:
Since the lemmatizer needs a part-of-speech tag for every word that is not a noun, how do we effectively lemmatize a whole dataset?

In [206]:
#Helper functions
def hasNumbers(inputString):
    """Return True if at least one character of ``inputString`` is a digit."""
    for char in inputString:
        if char.isdigit():
            # One digit is enough; no need to look at the rest.
            return True
    return False

def hasOnlyAlphaChars(inputString):
    """Return True if ``inputString`` is non-empty and every character is a letter."""
    return inputString.isalpha()

In [207]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [208]:
# Build the corpus and its labels: 1 marks a positive tweet, 0 a negative one.
# Every line of each file becomes one sample (surrounding whitespace stripped).
# NOTE(review): open() is called without an explicit encoding, so the platform
# default is used -- confirm this matches how the tweet files were saved.

data = []
data_labels = []
for tweet_path, sentiment in (("pos_tweets.txt", 1), ("neg_tweets.txt", 0)):
    with open(tweet_path) as tweet_file:
        for raw_line in tweet_file:
            data.append(raw_line.strip())
            data_labels.append(sentiment)

In [209]:
#clean the data
def remove_user_handles(tweets=None):
    """Drop every word that contains "@" from each tweet, in place.

    Args:
        tweets: list of tweet strings to clean in place. When omitted, the
            notebook-level ``data`` list is cleaned, so existing
            ``remove_user_handles()`` calls keep working.

    Returns:
        None. Each list entry is replaced by its remaining words joined with
        single spaces.
    """
    if tweets is None:
        tweets = data
    for i, tweet in enumerate(tweets):
        # split() with no argument breaks on any run of whitespace, so a handle
        # that follows a tab is not glued to its neighbour and removed with it.
        tweets[i] = " ".join(word for word in tweet.split() if "@" not in word)
            
def remove_numbers(tweets=None):
    """Drop every word that contains a digit from each tweet, in place.

    Args:
        tweets: list of tweet strings to clean in place. When omitted, the
            notebook-level ``data`` list is cleaned, so existing
            ``remove_numbers()`` calls keep working.

    Returns:
        None. Each list entry is replaced by its remaining words joined with
        single spaces.
    """
    if tweets is None:
        tweets = data
    for i, tweet in enumerate(tweets):
        # split() with no argument breaks on any run of whitespace, so a number
        # that follows a tab is not glued to its neighbour and removed with it.
        tweets[i] = " ".join(word for word in tweet.split() if not hasNumbers(word))
                
def remove_non_alpha_words(tweets=None):
    """Keep only the purely alphabetic words of each tweet, in place.

    A word that contains any non-letter character (digit, punctuation, "@",
    ...) is dropped as a whole.

    Args:
        tweets: list of tweet strings to clean in place. When omitted, the
            notebook-level ``data`` list is cleaned, so existing
            ``remove_non_alpha_words()`` calls keep working.

    Returns:
        None. Each list entry is replaced by its remaining words joined with
        single spaces.
    """
    if tweets is None:
        tweets = data
    for i, tweet in enumerate(tweets):
        # split() with no argument breaks on any run of whitespace; splitting on
        # " " alone would treat "good\tday" as one non-alphabetic word and drop it.
        tweets[i] = " ".join(word for word in tweet.split() if hasOnlyAlphaChars(word))
            
def clean_data():
    """Run the three cleaning passes in order: handles, numbers, non-alphabetic words."""
    for cleaning_pass in (remove_user_handles, remove_numbers, remove_non_alpha_words):
        cleaning_pass()

# Apply the cleaning passes to the tweets; the entries of `data` are rewritten in place.
clean_data()

In [210]:
# Vectorization: the general process of turning a collection of text documents into numerical feature vectors.
# Stop words such as "and" and "the", which are uninformative in some contexts, could be filtered out by
# passing a stop word list to CountVectorizer; none is passed here.
vectorizer = CountVectorizer(analyzer="word")

# Learn the vocabulary from the tweets and count how often each word occurs in each tweet.
features = vectorizer.fit_transform(data)
# Array form of the count matrix; this is what gets split into train and test rows
# and what getPrediction searches to map a test row back to its tweet.
features_nd = features.toarray()

In [211]:
# 90% of the rows go to training and the rest to testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(features_nd, data_labels, train_size=0.90, random_state=1000)

In [212]:
# Create a model: logistic regression with scikit-learn's default settings,
# fit on the training split only.
log_model = LogisticRegression()
# fit() returns the fitted estimator, so log_model still names the same model afterwards.
log_model = log_model.fit(X=X_train, y=y_train)

In [213]:
# Predict the label (1 = positive, 0 = negative) for every row of the test set.
y_pred = log_model.predict(X_test)

In [220]:
def posOrNeg(flag):
    """Map a label to its display name: 1 -> "Positive", anything else -> "Negative"."""
    return "Positive" if flag == 1 else "Negative"
    
def getPrediction(index):
    """Print the text, predicted sentiment and true sentiment of one test tweet.

    The test row is mapped back to its tweet by looking for rows of
    ``features_nd`` whose word counts equal ``X_test[index]``.

    Args:
        index: position of the sample within ``X_test`` / ``y_test`` / ``y_pred``.

    Raises:
        ValueError: if no row of ``features_nd`` equals ``X_test[index]``.
    """
    # Vectorised row comparison; avoids converting the whole dense matrix to
    # nested Python lists on every call.
    row_matches = (features_nd == X_test[index]).all(axis=1).nonzero()[0]
    candidates = [int(row) for row in row_matches]
    if not candidates:
        raise ValueError("test row %d was not found in features_nd" % index)
    # Several tweets can share one count vector (same words in another order,
    # duplicates, ...). Prefer the candidate whose label equals the true label of
    # this test sample, so the printed tweet and sentiment belong together;
    # otherwise fall back to the first match.
    ind = next((row for row in candidates if data_labels[row] == y_test[index]), candidates[0])
    print("The prediction for the tweet: '" + data[ind] + "' is: " + posOrNeg(y_pred[index]) + " . True sentiment: " + posOrNeg(y_test[index]))

In [222]:
# Accuracy of the predictions on the held-out test set.
print("Model accuracy rate: ", accuracy_score(y_test, y_pred))

Model accuracy rate:  0.8059701492537313


In [221]:
# Spot-check one test sample: print its tweet text, the predicted sentiment and the true sentiment.
getPrediction(84)

The prediction for the tweet: 'miss watching Modern' is: Negative . True sentiment: Negative
