# Lexical Comparison of the Transcript and Tweets Produced During the First 2016 US Presidential Debate

_Luca Nannini_

- __Preparing the raw .txt files of the official debate transcription__

- __Preparing the .csv file containing only the original tweets - no retweets - about the event__ 

- __Preparing a comparative analysis of lexical intersection between the datasets__

## Index 

[1. Cleaning the Transcripts](#1.-Cleaning-the-Transcripts)
> [1.1 Clinton's Speech](#1.1-Clinton's-Speech)

> [1.2 Trump's Speech](#1.2-Trump's-Speech)

> [1.3 Cleaned Debate Transcript Datasets](#1.3-Cleaned-Debate-Transcript-Datasets)

[2. Cleaning Tweets Dataset](#2.-Cleaning-Tweets-Dataset) 

[3. Comparative Analysis of Lexical Intersection](#3.-Comparative-Analysis-of-Lexical-Intersection)

***
_General Guidelines (NLTK)_:
1. Load the raw text.
1. Split into tokens.
1. Convert to lowercase.
1. Remove punctuation from each token.
1. Filter out remaining tokens that are not alphabetic.
1. Filter out tokens that are stop words.
1. Stem the tokens.
***

## 1. Cleaning the Transcripts

### 1.1 Clinton's Speech

In [None]:
import numpy as np
import pandas as pd
import nltk 
import sklearn
import re

# Absolute path of the raw text file holding Clinton's part of the debate.
Clinton = r'C:\Users\Luca Nannini\Desktop\Thesis/CLINTON_DEBATE.txt'

In [None]:
# Display the type of the object currently bound to `Clinton` (notebook cell output).
type(Clinton)

In [None]:
# Read the whole transcript into memory.
# - The context manager closes the handle even if read() raises.
# - The handle gets its own name, so `Clinton` keeps pointing at the path and
#   this cell can be re-run without open() receiving a file object.
# - Read-only mode is sufficient: nothing is written back to the file.
with open(Clinton, 'r') as transcript_file:
    text = transcript_file.read()

# Tokenise by splitting on runs of non-word characters. This discards
# whitespace and punctuation; underscores count as word characters and
# survive this step. Leading/trailing separators yield empty-string tokens.
wordsC = re.split(r'\W+', text)
print(wordsC)

In [None]:
# Fold every token to lowercase so later comparisons ignore case.
wordsC = list(map(str.lower, wordsC))
print(wordsC)

In [None]:
import string

# Translation table mapping every ASCII punctuation character to None,
# i.e. str.translate() deletes those characters.
table = str.maketrans(dict.fromkeys(string.punctuation))
stripped = [token.translate(table) for token in wordsC]
print(stripped)

In [None]:
# Keep only tokens made up entirely of alphabetic characters
# (this also drops empty strings, for which isalpha() is False).
wordsC = list(filter(str.isalpha, stripped))

In [None]:
from nltk.corpus import stopwords

# NLTK's English stop-word list (kept as a list under its original name).
stop_words = stopwords.words('english')
# Testing membership against a list is linear per token; a set makes each
# lookup constant-time without changing which tokens are kept.
stop_word_set = set(stop_words)
wordsC = [w for w in wordsC if w not in stop_word_set]
print(wordsC)

In [None]:
from nltk.stem.porter import PorterStemmer

# Reduce each remaining Clinton token to its Porter stem.
# NOTE(review): the result is stored in `stemmed`; `wordsC` itself is left
# unstemmed by this cell.
porter = PorterStemmer()
stemmed = list(map(porter.stem, wordsC))
print(stemmed)

In [None]:
from collections import OrderedDict

# De-duplicate while keeping first-occurrence order. Going through set()
# first would scramble the order (and make it vary between runs), which
# defeats the purpose of the OrderedDict step.
unique_tokens = OrderedDict.fromkeys(wordsC)
wordsC = list(unique_tokens)
# Last expression of the cell: shown as the notebook output.
unique_tokens.keys()

In [None]:
# Drop the candidate's own name tokens (presumably speaker labels from the
# transcript -- confirm against the raw file). Filtering instead of
# list.remove() avoids a ValueError when a token is absent and keeps the
# cell safe to re-run.
excluded_tokens = {'clinton', 'hillaryclinton'}
wordsC = [word for word in wordsC if word not in excluded_tokens]
print(wordsC)

### 1.2 Trump's Speech

In [None]:
import re
import string

# Absolute path of the raw text file holding Trump's part of the debate.
Trump = r'C:\Users\Luca Nannini\Desktop\Thesis/TRUMP_DEBATE.txt'

# Read the whole transcript. The context manager closes the handle even on
# error, and giving the handle its own name keeps `Trump` bound to the path,
# so the cell can be re-run. Read-only mode suffices: nothing is written.
with open(Trump, 'r') as transcript_file:
    text = transcript_file.read()

# First look: plain whitespace tokenisation (punctuation still attached).
wordsT = text.split()
print(wordsT)

# Actual tokenisation: split on runs of non-word characters.
wordsT = re.split(r'\W+', text)
print(wordsT)

# Delete any remaining ASCII punctuation characters from each token.
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in wordsT]
print(stripped)

In [None]:
# Keep purely alphabetic tokens and fold them to lowercase in a single pass.
wordsT = [token.lower() for token in stripped if token.isalpha()]
print(wordsT)

In [None]:
from nltk.corpus import stopwords

# NLTK's English stop-word list (kept as a list under its original name).
stop_words = stopwords.words('english')
# Testing membership against a list is linear per token; a set makes each
# lookup constant-time without changing which tokens are kept.
stop_word_set = set(stop_words)
wordsT = [w for w in wordsT if w not in stop_word_set]
print(wordsT)

In [None]:
from nltk.stem.porter import PorterStemmer

# Reduce each remaining Trump token to its Porter stem.
# NOTE(review): the result is stored in `stemmed`; `wordsT` itself is left
# unstemmed by this cell.
porter = PorterStemmer()
stemmed = list(map(porter.stem, wordsT))
print(stemmed)

In [None]:
from collections import OrderedDict

# De-duplicate while keeping first-occurrence order. Going through set()
# first would scramble the order (and make it vary between runs), which
# defeats the purpose of the OrderedDict step.
unique_tokens = OrderedDict.fromkeys(wordsT)
wordsT = list(unique_tokens)
# Last expression of the cell: shown as the notebook output.
unique_tokens.keys()

In [None]:
# Drop the candidate's own name token (presumably a speaker label from the
# transcript -- confirm against the raw file). Filtering instead of
# list.remove() avoids a ValueError when the token is absent and keeps the
# cell safe to re-run.
wordsT = [word for word in wordsT if word != 'trump']
print(wordsT)

### 1.3 Cleaned Debate Transcript Datasets

In [None]:
# print() always returns None, so assigning its result would leave `Trump`
# empty. Bind the cleaned token list first, then display it.
Trump = wordsT
print(Trump)

In [None]:
# print() always returns None, so assigning its result would leave `Clinton`
# empty. Bind the cleaned token list first, then display it.
Clinton = wordsC
print(Clinton)

## 2. Cleaning Tweets Dataset

In [None]:
import re
import string

# Absolute path of the text file holding the collected tweets.
Tweets = r'C:\Users\Luca Nannini\Desktop\Thesis/TWEETS.txt'

# Read the whole file. The context manager closes the handle even on error,
# and giving the handle its own name keeps `Tweets` bound to the path, so
# the cell can be re-run. Read-only mode suffices: nothing is written.
with open(Tweets, 'r') as tweets_file:
    text = tweets_file.read()

# First look: plain whitespace tokenisation (punctuation still attached).
Tw = text.split()
print(Tw)

# Actual tokenisation: split on runs of non-word characters.
Tw = re.split(r'\W+', text)
print(Tw)

# Delete any remaining ASCII punctuation characters from each token.
table = str.maketrans('', '', string.punctuation)
stripped = [w.translate(table) for w in Tw]
print(stripped)

In [None]:
# Keep purely alphabetic tokens and fold them to lowercase in a single pass.
Tw = [token.lower() for token in stripped if token.isalpha()]
print(Tw)

In [None]:
from nltk.corpus import stopwords

# NLTK's English stop-word list (kept as a list under its original name).
stop_words = stopwords.words('english')
# Testing membership against a list is linear per token; a set makes each
# lookup constant-time without changing which tokens are kept.
stop_word_set = set(stop_words)
Tw = [w for w in Tw if w not in stop_word_set]
print(Tw)

In [None]:
from nltk.stem.porter import PorterStemmer

# Reduce each remaining tweet token to its Porter stem.
# NOTE(review): the result is stored in `stemmed`; `Tw` itself is left
# unstemmed by this cell.
porter = PorterStemmer()
stemmed = list(map(porter.stem, Tw))
print(stemmed)

In [None]:
from collections import OrderedDict

# De-duplicate the tweet tokens once, keeping first-occurrence order, and use
# that single result both for `Tweets` and for the displayed keys. Building
# `Tweets` through set() would give it an order that varies between runs.
unique_tokens = OrderedDict.fromkeys(Tw)
Tweets = list(unique_tokens)
# Last expression of the cell: shown as the notebook output.
unique_tokens.keys()

## 3. Comparative Analysis of Lexical Intersection

In [None]:
# Merge Trump's and Clinton's word lists into one set with no duplicate
# words; both names below refer to that same set object.
MS = set(wordsT).union(wordsC)
Merged_Transcription = MS
print(MS)

In [None]:
# Words that occur both in the merged transcript vocabulary and in the tweets.
Matching_Words = set(MS) & set(Tweets)
print(Matching_Words)

In [None]:
# Words that occur in exactly one of the two vocabularies
# (merged transcript or tweets), but not in both.
Symmetric_Word_Difference = set(MS) ^ set(Tweets)
print(Symmetric_Word_Difference)

In [None]:
# Number of words shared by the transcript vocabulary and the tweets (notebook cell output).
len(Matching_Words)

In [None]:
# Number of words found in only one of the two vocabularies (notebook cell output).
len(Symmetric_Word_Difference)