## NLP Project Notebook

#### Libraries, Imports and Data

In [None]:
import csv # csv reader
import nltk
from sklearn.svm import LinearSVC
from nltk.classify import SklearnClassifier
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from random import shuffle
from sklearn.pipeline import Pipeline
from sklearn import metrics as Skmet #used for precision_recall_fscore_support()
from operator import itemgetter #used to unpack turples
import re
import math

#### Preprocessing and Data Loading

In [65]:
def parseData(dataLine):
    """Pick the first three fields out of one dataset row.

    dataLine is indexed at positions 0, 1 and 2; the values found there
    are returned as a (line, character, gender) tuple.
    """
    spoken, speaker, sex = dataLine[0], dataLine[1], dataLine[2]
    return spoken, speaker, sex

def parseContractions(dataLine):
    """Return the (contraction, expansion) pair of a row, both lower-cased.

    The contraction is read from index 0 and its expansion from index 1.
    """
    short_form = dataLine[0]
    long_form = dataLine[1]
    return tuple(part.lower() for part in (short_form, long_form))

def preProcess(text):
    """Turn a raw line of text into a list of normalised tokens.

    Processing steps, in order:
      1. lower-case the text and strip double-quote characters;
      2. tokenize with a regex that keeps apostrophes attached to words so
         that contractions survive as single tokens;
      3. replace every token found in the module-level ``contractions``
         dict by the space-separated words of its expansion;
      4. drop English stop words and empty strings.

    No lemmatization is applied.

    Returns:
        list of str: the remaining tokens, in their original order.
    """
    text = text.lower()  # normalize the text
    text = re.sub(r"\"", "", text)  # speech mark removal

    # Keeps " ' " inside/around words to allow for the expansion step below.
    tokenizer = RegexpTokenizer(r"[']?\w+[']?\w?\w?")
    stopWords = set(stopwords.words('english'))

    expanded = []
    for token in tokenizer.tokenize(text):
        expansion = contractions.get(token)
        if expansion is not None:
            # An expansion may be several words; add each one separately.
            expanded.extend(expansion.split(" "))
        else:
            expanded.append(token)

    # Splitting an expansion on single spaces can yield "" (empty expansion
    # or doubled spaces); those must be dropped together with stop words.
    return [token for token in expanded if token and token not in stopWords]

def loadData(path, Text=None):
    """Read the comma-separated dataset at ``path`` into the global lists.

    Every row is split with ``parseData`` and its text is run through
    ``preProcess``. Rows whose text preprocesses to an empty token list
    (originally empty, or fully trimmed away) are skipped. For each
    remaining row:
      * ``rawData`` receives ``(line, character, gender)``;
      * ``preprocessedData`` receives ``(tokens, character, gender)``.

    Args:
        path: location of the UTF-8 encoded CSV file.
        Text: unused; kept so existing callers keep working.
    """
    # newline='' lets the csv module handle line endings itself.
    with open(path, encoding='utf8', newline='') as f:
        reader = csv.reader(f, delimiter=',')
        for row in reader:
            if not row:  # csv yields [] for blank lines; nothing to parse
                continue
            (line, character, gender) = parseData(row)
            tokens = preProcess(line)
            if tokens:  # keep only lines that still have content
                rawData.append((line, character, gender))
                preprocessedData.append((tokens, character, gender))
                
def loadContractions(path):
    """Fill the global ``contractions`` dict from a tab-separated file.

    Each row of the UTF-8 file at ``path`` is handed to
    ``parseContractions``; the resulting contraction becomes the key and
    its expansion the value.
    """
    with open(path, encoding='utf8') as handle:
        for row in csv.reader(handle, delimiter='\t'):
            short_form, long_form = parseContractions(row)
            contractions[short_form] = long_form
            
def splitData(percentage):
    """Split ``rawData`` into the four global train/test lists.

    ``rawData`` is treated as two halves. The first
    ``percentage * len(rawData) / 2`` rows of each half become training
    rows and the rest of each half become test rows.
    NOTE(review): this presumably relies on ``rawData`` being ordered so
    that each half holds one group of samples - confirm against the data.

    For every row the text is preprocessed and vectorised once, then
    appended to:
      * ``trainDataBinary`` / ``testDataBinary`` as ``(features, gender)``;
      * ``trainDataMulti`` / ``testDataMulti`` as ``(features, character)``.

    Args:
        percentage: fraction (0..1) of each half used for training.
    """
    dataSamples = len(rawData)
    halfOfData = dataSamples // 2
    trainingSamples = int((percentage * dataSamples) / 2)

    trainRows = (rawData[:trainingSamples]
                 + rawData[halfOfData:halfOfData + trainingSamples])
    testRows = (rawData[trainingSamples:halfOfData]
                + rawData[halfOfData + trainingSamples:])

    for rows, binaryTarget, multiTarget in (
        (trainRows, trainDataBinary, trainDataMulti),
        (testRows, testDataBinary, testDataMulti),
    ):
        for (line, character, gender) in rows:
            # Vectorise once per row: toFeatureVector also updates the
            # global featureDict, so a second call would count every
            # token twice.
            features = toFeatureVector(preProcess(line))
            binaryTarget.append((features, gender))
            # Separate copy so the binary and multi-class sets do not
            # share one mutable dict.
            multiTarget.append((dict(features), character))
        
def toFeatureVector(tokens):
    """Count token occurrences for one line and update the global counts.

    Returns a dict mapping each token to the number of times it appears in
    ``tokens``. As a side effect, the module-level ``featureDict`` is
    incremented once per token occurrence.
    """
    lineDict = {}

    # First pass: per-line counts.
    for word in tokens:
        lineDict[word] = lineDict.get(word, 0) + 1

    # Second pass: running totals across all lines seen so far.
    for word in tokens:
        featureDict[word] = featureDict.get(word, 0) + 1

    return lineDict

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Splitting dataset


#### Training and Cross validation

In [None]:
def trainClassifier(trainData):
    """Train an NLTK-wrapped scikit-learn pipeline on ``trainData``.

    The pipeline has a single ``LinearSVC`` step named 'svc'. The result
    of ``SklearnClassifier.train`` is returned.
    """
    print("Training Classifier...")
    svmPipeline = Pipeline([('svc', LinearSVC())])
    model = SklearnClassifier(svmPipeline)
    return model.train(trainData)

def crossValidate(dataset, folds):
    """Run k-fold cross validation and return the averaged scores.

    The dataset is shuffled in place and cut into exactly ``folds``
    consecutive slices; the last slice also takes any remainder rows, so
    the number of evaluated folds always matches the divisor used for the
    averages. For each fold a classifier is trained on the other slices
    with ``trainClassifier`` and every held-out sample is labelled with
    ``predictLabel``.

    Precision, recall and F-score are computed with weighted averaging
    over the labels that occur, so the same code scores both the binary
    and the multi-class label sets.

    Args:
        dataset: list of ``(featureDict, label)`` pairs. Shuffled in place.
        folds: number of folds; must be at least 2 and no larger than
            ``len(dataset)``.

    Returns:
        tuple: ``(precision, recall, f_score, accuracy)`` averaged over
        the folds.

    Raises:
        ValueError: if ``folds`` is below 2 or exceeds the dataset size.
    """
    if folds < 2:
        raise ValueError("folds must be at least 2")
    if len(dataset) < folds:
        raise ValueError("dataset has fewer samples than folds")

    shuffle(dataset)

    totalPrecision = 0
    totalRecall = 0
    totalFScore = 0
    totalAccuracy = 0
    foldSize = len(dataset) // folds

    for fold in range(folds):
        start = fold * foldSize
        # The final fold absorbs the remainder so no sample is left out
        # and no extra, undersized fold is produced.
        end = start + foldSize if fold < folds - 1 else len(dataset)

        trainingSet = dataset[:start] + dataset[end:]
        testingSet = dataset[start:end]
        trueLabels = [sample[1] for sample in testingSet]
        testingFeatures = [sample[0] for sample in testingSet]

        classifier = trainClassifier(trainingSet)
        results = [predictLabel(features, classifier)
                   for features in testingFeatures]

        precision, recall, f_score, _ = Skmet.precision_recall_fscore_support(
            trueLabels, results, average='weighted')
        accuracy = Skmet.accuracy_score(trueLabels, results)

        totalPrecision += precision
        totalRecall += recall
        totalFScore += f_score
        totalAccuracy += accuracy

    return (totalPrecision / folds, totalRecall / folds,
            totalFScore / folds, totalAccuracy / folds)



#### Main

In [66]:
# Global state filled in by the loading / splitting functions.
featureDict = {}       # token -> total count, updated by toFeatureVector
contractions = {}      # english contraction -> expansion
rawData = []           # (line, character, gender) rows kept from the dataset file
preprocessedData = []  # (tokens, character, gender) for the same rows
trainDataBinary = []   # training data with the binary gender labels
testDataBinary = []    # test data (currently 10%) with the binary gender labels
trainDataMulti = []    # training data with the multi-class name labels
testDataMulti = []     # test data (currently 10%) with the multi-class name labels

trainingDataSource = "training.csv"
testingDataSource = "test.csv"
contractionsPath = "contractions.txt"

# Report the sizes of the lists that actually exist; the binary lists have
# the same length as the multi-class ones.
print("Now %d rawData, %d trainData, %d testData" % (len(rawData), len(trainDataBinary), len(testDataBinary)), "Preparing the dataset...",sep='\n')

# Contractions must be loaded first: preProcess (used by loadData) reads them.
loadContractions(contractionsPath)
loadData(trainingDataSource)

print("Splitting dataset")
splitData(0.9)

crossValidationBinaryResults = crossValidate(trainDataBinary, 10)

crossValidationMultiResults = crossValidate(trainDataMulti, 10)

Now 0 rawData, 0 trainData, 0 testData
Preparing the dataset...
Splitting dataset


NameError: name 'crossValidate' is not defined