## Projekt

### Download NLTK packages

In [None]:
import nltk
nltk.download()

### Data Collection and Preprocessing

In [8]:
import csv
csv_file = open('Tweets.csv', encoding='utf8')
csv_reader = csv.reader(csv_file, delimiter = ',')
next(csv_reader)
lines = [row for row in csv_reader]
sentiments = [row[1] for row in lines]
print(sentiments)


['neutral', 'positive', 'neutral', 'negative', 'negative', 'negative', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'positive', 'positive', 'positive', 'positive', 'negative', 'positive', 'negative', 'positive', 'positive', 'negative', 'positive', 'positive', 'neutral', 'negative', 'negative', 'negative', 'neutral', 'negative', 'neutral', 'negative', 'neutral', 'negative', 'negative', 'positive', 'neutral', 'positive', 'positive', 'neutral', 'negative', 'positive', 'negative', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'positive', 'neutral', 'neutral', 'neutral', 'negative', 'positive', 'positive', 'neutral', 'neutral', 'neutral', 'negative', 'neutral', 'neutral', 'positive', 'neutral', 'negative', 'negative', 'positive', 'negative', 'neutral', 'neutral', 'neutral', 'negative', 'positive', 'positive', 'neutral', 'neutral', 'negative', 'neutral', 'negative', 'positive', 'negative', 'negative', 'negative', 'negative',

### Splitting the data into training and testing

In [18]:
data_len = len(lines)
train_cnt = int(data_len * 0.8)
test_cnt = data_len - train_cnt
train = lines[:train_cnt]
test  = lines[train_cnt:]

11712 11712


### Tokenize, normalization, stemming, stop-word removal, case folding, and URL removal

&nbsp;

## Exercise 1-2

### Extract N-characters from a string

In [4]:
word = 'Hello'
first3 = word[:3]
print(first3)
last2 = word[-2:]
print(last2)

# same for lists
lst = ['Today','is','a','nice','day']
first2 = lst[:2]
print(first3)
last2 = lst[-2:]
print(last2)

Hel
lo
Hel
['nice', 'day']


### Split sentence using .slpit() and NLTK tokenizer

In [5]:
sentence = 'I need four paint colors: blue, gray, green, and red.'

print('Splitting on spaces')
splitted =  sentence.split(' ') # commas and colon are still there
print(splitted)
print('List length', len(splitted), '\n')
    
print('-----\nSplitting using word_tokenize')

from nltk import word_tokenize

tokenized = word_tokenize(sentence) # splits over spaces and other delimiters, commas and colon generated as tokens
print(tokenized)
print('List length', len(tokenized), '\n')
    
print('-----\nString length', len(sentence), '\n')

Splitting on spaces
['I', 'need', 'four', 'paint', 'colors:', 'blue,', 'gray,', 'green,', 'and', 'red.']
List length 10 

-----
Splitting using word_tokenize
['I', 'need', 'four', 'paint', 'colors', ':', 'blue', ',', 'gray', ',', 'green', ',', 'and', 'red', '.']
List length 15 

-----
String length 53 



### Convert to lowercase

In [6]:
sent = "Today Is A Bloweutiful Day"
sentLcase = sent.lower()
print(sentLcase,'\n-----')

lst = ['Today', 'Is', 'A', 'Beautiful', 'Day']
lsttLcase = [x.lower() for x in lst]
print(lsttLcase)

today is a bloweutiful day 
-----
['today', 'is', 'a', 'beautiful', 'day']


&nbsp;
## Exercise 1-3

### Dictionary

In [7]:
fruitColor = {'apple': 'red', 'banana':'yellow', 'orange':'orange'}
print(fruitColor)

# another way to initialize a dictonary
fruitColor = {}
fruitColor['apple'] = 'red'
fruitColor['banana'] = 'yellow'
fruitColor['orange'] = 'orange'
print(fruitColor)
print()

print('Looping over keys')
for key in fruitColor:
    print(key, ':', fruitColor[key])
print()
    
print('Printing keys and values')
print(fruitColor.keys())
print(fruitColor.values())

print()

print('Checking if a key exists in dict')
print('apple' in fruitColor, 'lemon' in fruitColor)
print()

test_keys = ['apple', 'lemon']
for test in test_keys:
    if test in fruitColor:
        print('key %s in dict' % test)
    else:
        print('key %s not in dict' % test)
print(fruitColor['lemon'])

{'banana': 'yellow', 'apple': 'red', 'orange': 'orange'}
{'banana': 'yellow', 'apple': 'red', 'orange': 'orange'}

Looping over keys
banana : yellow
apple : red
orange : orange

Printing keys and values
dict_keys(['banana', 'apple', 'orange'])
dict_values(['yellow', 'red', 'orange'])

Checking if a key exists in dict
True False

key apple in dict
key lemon not in dict


KeyError: 'lemon'

### defaultdict

In [8]:
from collections import defaultdict

dd = defaultdict(list)
for key in fruitColor:
    dd[key] = fruitColor[key]
    
print(dd)
print(dd['lemon'])

defaultdict(<class 'list'>, {'banana': 'yellow', 'apple': 'red', 'orange': 'orange'})
[]


### defaultdict VS dict

In [9]:
from collections import defaultdict

dd = defaultdict(int)
dd['met']+=1
print(dd)

d = dict()
d['met']+=1 # Error

defaultdict(<class 'int'>, {'met': 1})


KeyError: 'met'

&nbsp;
## Exercise 1-4

### Create document: using dictionary

In [10]:
import os

docs = {} # Document dictionary, key is file name, value is set of words in every file
i = 0

for root, dirs, files in os.walk('single-docs/'):
    for file in files[:20000]: # looping only over first 20,000 files
        
        with open(os.path.join(root,file), 'r', encoding="utf8") as f:
            doc_words = f.read().split()
            docs[file] = set(doc_words) #convert to set to query more efficiently
            
            # Printing progress
            if i%10000==0:
                print('At document', i) 
            i+=1

At document 0
At document 10000


### Querying for a word as input by user

In [11]:
import time

query = input('Enter query word ')

t1 = time.time()
found = list()

for doc_id in docs: # looping over all document keys
    if query in docs[doc_id]: # is the query word in the set of words for this document?
        found.append(doc_id) # append file name to found list
        
print('Time taken',time.time() - t1, 'seconds')
print('Length of found documents',len(found))
print('\nFirst Document found:')
print(open('single-docs/%s'%found[0], 'r').read())

Enter query word president
Time taken 0.0 seconds
Length of found documents 157

First Document found:
Horst KÃ¶hler

Horst KÃ¶hler (born 22 February 1943) is a German politician of the Christian Democratic Union. He was President of Germany from 2004 to 2010. He was the candidate of the two Christian Democratic sister parties, the CDU and the CSU, and the liberal FDP, for becoming President. KÃ¶hler was elected to his first five-year term by the Federal Assembly on 23 May 2004. He was reelected to a second term on 23 May 2009. About one year later, on 31 May 2010, he resigned from his office in a controversy over his comment on the role of the Bundeswehr in light of a visit to the troops in Afghanistan.

KÃ¶hler is an economist by profession. Prior to his election as President, KÃ¶hler had a career in politics, the civil service and as a banking executive. He was President of the European Bank for Reconstruction and Development from 1998 to 2000. He also served as the head of the Inte