## NPL Project Notebook

#### Libraries, Imports and Data

In [5]:
import csv # csv reader
import nltk
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn import metrics as Skmet #used for precision_recall_fscore_support()
from operator import itemgetter #used to unpack turples
import re
import math
import multiprocessing
from gensim.models import Word2Vec
from gensim.models import KeyedVectors
import time

#### Preprocessing and Data Loading

In [6]:
def parseData(dataLine):
    line = dataLine[0]     
    character = dataLine[1] 
    gender = dataLine[2]   
    return (line, character, gender)

def parseContractions(dataLine):
    contraction = dataLine[0]     
    expansion = dataLine[1]    
    return (contraction.lower(), expansion.lower())

def preProcess(text):
    # Should return a list of tokens
    text = text.lower() #normalize the text 
    text = re.sub(r"\"", "", text) # speach mark removal.
    
    tokenizer = RegexpTokenizer(r"[']?\w+[']?\w?\w?\w?\w?\w?") # splits sent into words maintain  " ' " to allow for later expansion.
    tokenizedTemp = tokenizer.tokenize(text)
    lemmatizer = WordNetLemmatizer()
    stopWords = set(stopwords.words('english')) #stop word removal.
    
    contractionExpandedTemp = []
    for w in tokenizedTemp:
        contemp = contractions.get(w)
        if  contemp != None:
            tempContract = contractions[w] #getting contraction dictionary value
            expandedContraction = tempContract.split(" ") #splits expanded cotnractions then adds it to the processed text.
            contractionExpandedTemp.extend(expandedContraction);
        else:
            contractionExpandedTemp.append(w)
    processedTemp = []
    
    for w in contractionExpandedTemp: #stop word removal and lemmatization
        if (w not in stopWords) or w == "": # removes stop words and blank strings
            processedTemp.append(w)  #lemmatizer.lemmatize(w))   
    return processedTemp

def loadData(path, Text=None):
    with open(path,encoding='utf8') as f:
        reader = csv.reader(f, delimiter=',')
        for line in reader:
            (line, character, gender) = parseData(line)
            temp = preProcess(line)
            #if not temp: #removes empty lines. Either fully trimmed or original empty.
            rawData.append((line, character, gender));  
            preprocessedData.append((preProcess(line), character, gender))
                
def loadContractions(path):
    with open(path,encoding='utf8') as f:
        reader = csv.reader(f, delimiter='\t')
        for line in reader:
            (contraction, expansion) = parseContractions(line)
            contractions[contraction] = expansion
            
def splitData(percentage): #splits the data into 4 sets
    dataSamples = len(rawData)
    halfOfData = int(len(rawData)/2)
    trainingSamples = int((percentage*dataSamples)/2)
    for (line, character, gender) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainDataBinary.append((toFeatureVector(preProcess(line)),gender))
    
    for (line, character, gender) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testDataBinary.append((toFeatureVector(preProcess(line)),gender))
        
    for (line, character, gender) in rawData[:trainingSamples] + rawData[halfOfData:halfOfData+trainingSamples]:
        trainDataMulti.append((toFeatureVector(preProcess(line)),character))
    
    for (line, character, gender) in rawData[trainingSamples:halfOfData] + rawData[halfOfData+trainingSamples:]:
        testDataMulti.append((toFeatureVector(preProcess(line)),character))
        
def toFeatureVector(tokens):
    lineDict = {} # Should return a dictionary containing features as keys, and weights as values
    for token in tokens: #adding to the line dict
        if token in lineDict:
            lineDict[token] = lineDict[token] + 1
        else:
            lineDict[token] = 1
            
    for token in tokens: #adds values to featureDict
        if token in featureDict:
            featureDict[token] = featureDict[token] + 1
        else:
            featureDict[token] = 1
    
    return lineDict

#### Training and Cross validation

In [16]:
def trainClassifier(trainData):
    print("Training Classifier...")
    pipeline =  Pipeline([('svc', LinearSVC())])
    return SklearnClassifier(pipeline).train(trainData)

def crossValidate(dataset, folds):
    shuffle(dataset)
    #print(dataset[0])
    cv_results = []
    temp = []
    totalPrecision = 0
    totalRecall = 0
    totalFScore = 0
    totalAccuracy = 0
    foldSize = int(len(dataset)/folds) #the fold size
    for i in range(0,len(dataset),foldSize):
        tempDataSet = dataset[0:i] + dataset[i+foldSize:len(dataset)] # joins to parts of the list to form the dataset to test.
        testingSet = dataset[i:i+foldSize] #takes the fold size an i to find the current test data.
        trueLabels = list(map(itemgetter(1), testingSet)) #creates a 1D array of result labels
        testingSetRemovedLabel = list(map(itemgetter(0),testingSet))
        
        #training
        classifier = trainClassifier(tempDataSet) #classifier using 9/10th of the dataset
        #classifing
        
        results = []
        for i in testingSetRemovedLabel:
            results.append(predictLabel(i,classifier))
        
        #print(results[0:10])        
        #print(trueLabels[0:10])
        
        precision = Skmet.precision_score(trueLabels, results,average='macro') #finds precision score
        #print(precision)
        recall = Skmet.recall_score(trueLabels, results,average='macro') #finds recall score
        #print(recall)
        f_score = 2 * (precision * recall)/(precision + recall) #calculates f_score
        #print(f_score)
        accuracy = Skmet.accuracy_score(trueLabels,results) #calculate accuracy
        #print(accuracy)
        totalPrecision += precision
        totalRecall += recall
        if math.isnan(f_score): #if f_score not a number won't add it to the total
            totalFScore += 0
        else:
            totalFScore += f_score
        totalAccuracy += accuracy
    
    cv_results = (totalPrecision/folds,totalRecall/folds,totalFScore/folds,totalAccuracy/folds)
        
    return cv_results

def predictLabels(lineSamples, classifier):
    return classifier.classify_many(map(lambda t: toFeatureVector(preProcess(t[1])), lineSamples))

def predictLabel(lineSample, classifier):
    return classifier.classify((lineSample))

def word2vecInitModel():
    cores = multiprocessing.cpu_count()
    word2vec = Word2Vec(min_count=3, #the kaggle tutorial used a much larger dataset 
                     window=2,
                     size=100,
                     sample=4e-6, 
                     alpha=0.03, 
                     min_alpha=0.0007, 
                     negative=20,
                     workers=cores-1)
    return word2vec

def word2vecVocabandTraining(dataset, model):
    t = time.time()
    datasetSansLabels = list(map(itemgetter(0),dataset))
    model.build_vocab(datasetSansLabels, progress_per=2000)
    print('Time to build vocab: {} mins'.format(round((time.time() - t) / 60, 2)))
    
    t = time.time();
    model.train(datasetSansLabels, total_examples=model.corpus_count, epochs=30, report_delay=1)
    print('Time to train the model: {} mins'.format(round((time.time() - t) / 60, 2)))
    return model

def crossValidateWithW2V(dataset, folds):
    shuffle(dataset)
    #print(dataset[0])
    cv_results = []
    temp = []
    totalPrecision = 0
    totalRecall = 0
    totalFScore = 0
    totalAccuracy = 0
    foldSize = int(len(dataset)/folds) #the fold size
    for i in range(0,len(dataset),foldSize):
        tempDataSet = dataset[0:i] + dataset[i+foldSize:len(dataset)] # joins to parts of the list to form the dataset to test.
        testingSet = dataset[i:i+foldSize] #takes the fold size an i to find the current test data.
        trueLabels = list(map(itemgetter(1), testingSet)) #creates a 1D array of result labels
        testingSetRemovedLabel = list(map(itemgetter(0),testingSet))
        
        model = word2vecVocabandTraining(tempDataSet, word2vecInitModel());
        word_vectors = model.wv
        
        #transforming the word 2 vector into something useful.
        #creating sentence vectors using the word vector model.
        
        
        sentence_vectors_with_labels = create_sentence_vectors() 
        
        #training
        classifier = trainClassifier() #classifier using 9/10th of the dataset
        #classifing
        
        results = []
        for i in testingSetRemovedLabel:
            results.append(predictLabel(i,classifier))
        
        #print(results[0:10])        
        #print(trueLabels[0:10])
        
        precision = Skmet.precision_score(trueLabels, results,average='macro') #finds precision score
        #print(precision)
        recall = Skmet.recall_score(trueLabels, results,average='macro') #finds recall score
        #print(recall)
        f_score = 2 * (precision * recall)/(precision + recall) #calculates f_score
        #print(f_score)
        accuracy = Skmet.accuracy_score(trueLabels,results) #calculate accuracy
        #print(accuracy)
        totalPrecision += precision
        totalRecall += recall
        if math.isnan(f_score): #if f_score not a number won't add it to the total
            totalFScore += 0
        else:
            totalFScore += f_score
        totalAccuracy += accuracy
    
    cv_results = (totalPrecision/folds,totalRecall/folds,totalFScore/folds,totalAccuracy/folds)
        
    return cv_results


#### Main

In [15]:
featureDict = {} # A global dictionary of features
contractions = {} #dictionary of english contractions
rawData = []          # the filtered data from the dataset file
preprocessedData = [] # the preprocessed reviews 
trainDataBinary = []   # the training data with the binary gender labels.
testDataBinary = [] # the test data currently 10% with the binary gender labels.
trainDataMulti = [] # the training data with the multi class name labels.
testDataMulti = [] # the test data currently 10% with the multi class name labels.

trainingDataSource = "training.csv"
testingDataSource = "test.csv"
contractionsPath = "contractions.txt"
loadContractions(contractionsPath)
nltk.download("wordnet", "c:/nltk_data/")
nltk.download('stopwords', "c:/nltk_data/")


print("Now %d rawData, %d trainData, %d testData, %d trainData, %d testData" % (len(rawData), len(trainDataMulti), len(testDataMulti),len(trainDataBinary), len(testDataBinary)),"Preparing the datasets...",sep='\n')
loadData(trainingDataSource)  


print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainDataBinary), len(testDataBinary)),
      "Preparing training and test data...",sep='\n')
splitData(0.9)
# We print the number of training samples and the number of features
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainDataBinary), len(testDataBinary)),
      "Training Samples: ", len(trainDataBinary), "Features: ", len(featureDict), sep='\n')

#crossValidationBinaryResults = crossValidate(trainDataBinary, 10)
#print("Precision, Recall, Fscore, Accuracy")
#print(crossValidationBinaryResults)

#crossValidationMultiResults = crossValidate(trainDataMulti, 10)

#print("Precision, Recall, Fscore, Accuracy")
#print(crossValidationMultiResults)

crossValidationBinaryResults = crossValidateWithW2V(trainDataBinary, 10)


[nltk_data] Downloading package wordnet to c:/nltk_data/...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to c:/nltk_data/...
[nltk_data]   Package stopwords is already up-to-date!


Now 0 rawData, 0 trainData, 0 testData, 0 trainData, 0 testData
Preparing the datasets...
Now 10113 rawData, 0 trainData, 0 testData
Preparing training and test data...
Now 10113 rawData, 9100 trainData, 1013 testData
Training Samples: 
9100
Features: 
5754
Time to build vocab: 0.0 mins
Time to train the model: 0.01 mins
<gensim.models.keyedvectors.Word2VecKeyedVectors object at 0x0000021A31E04548>
Training Classifier...


TypeError: 'int' object is not iterable

In [11]:
#print((model.wv.vocab))
print(model.wv.most_similar(positive=["sean"]))

[('get', 0.98886638879776), ('like', 0.9887083172798157), ('got', 0.9886667728424072), ('back', 0.988623321056366), ('want', 0.9886059761047363), ('going', 0.988605260848999), ('come', 0.9886035919189453), ('one', 0.9882631301879883), ('know', 0.9881845712661743), ('day', 0.9881817102432251)]


#### Results

##### Here are the results on a simple SVM 
###### Binary (Gender)
Precision, Recall, Fscore, Accuracy
(0.552863688427898, 0.5525422108455417, 0.5527028606377715, 0.5524175824175823)
###### Multiclass (Name)
Precision, Recall, Fscore, Accuracy
(0.17559268036085093, 0.1614969298891055, 0.16823929530879572, 0.19098901098901097)
##### Here are the results with the edition of Word2Vec Features
###### Binary (Gender)
Precision, Recall, Fscore, Accuracy

###### Multiclass (Name)
Precision, Recall, Fscore, Accuracy

