In [None]:
# task for today 
# dataset - https://archive.ics.uci.edu/ml/datasets/Bag+of+words/

# Q1 - count how many times each word occurs in its respective file
#      and return the result as [(word, count)]
# Q2 - perform a reduce operation to count the words starting with each
#      letter and return the result as [(a,50), (b,40)..]
# Q3 - clean every word in the dataset by stripping out
#      punctuation
# Q4 - create a set of tuples from all the records available
#      in all 5 files and then store them safely in a SQLite DB.
# 

In [1]:
# for logging 

import logging
import sys

def configure_logging(log_path='logs.log'):
    """Attach a stdout handler and a file handler to the root logger.

    Safe to call repeatedly (e.g. when the notebook cell is re-run): handlers
    installed by an earlier call are removed first, so each message is emitted
    once per destination instead of once per run of the cell.

    Args:
        log_path: file the file handler appends to.

    Returns:
        Tuple ``(logger, formatter, stdout_handler, file_handler)``.
    """
    root = logging.getLogger()
    # The logger level is INFO, so DEBUG records are dropped before they
    # reach the handlers even though the handlers themselves accept DEBUG.
    root.setLevel(logging.INFO)
    fmt = logging.Formatter('%(asctime)s | %(levelname)s | %(message)s',
                            '%m-%d-%Y %H:%M:%S')

    # Drop only the handlers this function installed earlier; handlers added
    # by anything else are left alone.
    for old_handler in list(root.handlers):
        if getattr(old_handler, '_notebook_handler', False):
            root.removeHandler(old_handler)
            old_handler.close()

    to_stdout = logging.StreamHandler(sys.stdout)
    to_file = logging.FileHandler(log_path)
    for new_handler in (to_file, to_stdout):
        new_handler.setLevel(logging.DEBUG)
        new_handler.setFormatter(fmt)
        # Tag the handler so a later call can recognise and replace it.
        new_handler._notebook_handler = True
        root.addHandler(new_handler)

    return root, fmt, to_stdout, to_file


logger, formatter, stdout_handler, file_handler = configure_logging()


In [None]:
# Q1
def count_words(path):
    """Count how often each word occurs in the file at ``path``.

    Words are the whitespace-separated tokens of the file, compared
    case-insensitively. Whole words are counted (not substring matches), and
    each distinct word appears exactly once in the result.

    Args:
        path: path of the text file to read.

    Returns:
        List of ``(word, count)`` tuples in order of first appearance.

    Raises:
        IOError: if the file cannot be opened or read.
    """
    from collections import Counter

    with open(path, 'r') as f:
        # Lower-case once up front instead of once per word.
        words = f.read().lower().split()
    return list(Counter(words).items())


# initialization
newList = []
# reading a file
try:
    newList = count_words('../Python-Programs/data/vocab.enron.txt')
except IOError:
    print("File does not exist")
except Exception as e:
    print("Program ended with an error ", e)
print(newList)

In [3]:
# Q2
def count_first_letters(path):
    """Count the words in the file at ``path`` by their first character.

    Words are the whitespace-separated tokens of the file. The file is
    scanned once; each word contributes to the tally of its first character.

    Args:
        path: path of the text file to read.

    Returns:
        List of ``(character, count)`` tuples in order of first appearance.

    Raises:
        IOError: if the file cannot be opened or read.
    """
    from collections import Counter

    with open(path, 'r') as f:
        words = f.read().split()
    # split() never yields empty tokens, so word[0] is always valid.
    return list(Counter(word[0] for word in words).items())


# Initialization
set_of_words = []

try:
    # reading a file
    set_of_words = count_first_letters('../Python-Programs/data/vocab.enron.txt')
# IOError must be handled before the generic Exception handler; the other
# way round the generic handler catches it first and this one never runs.
except IOError:
    print("File not found")
except Exception as e:
    print('Program ended with exception as ', e)
# sort the output
print(sorted(set_of_words, key = lambda x: x[-1]))

[('x', 62), ('z', 66), ('y', 111), ('q', 135), ('j', 377), ('k', 426), ('v', 469), ('u', 562), ('o', 685), ('n', 768), ('w', 805), ('g', 859), ('l', 989), ('h', 1000), ('i', 1084), ('f', 1160), ('t', 1366), ('e', 1404), ('b', 1557), ('m', 1648), ('d', 1664), ('r', 1723), ('a', 1800), ('p', 1945), ('c', 2611), ('s', 2826)]


In [None]:
# Q3
# declaring all the punctuations as a string
# (raw string: the backslash is a literal character to strip, not an escape)
punctuation = r'''!()-[]{};:'"\, <>./?@#$%^&*_~+='''


def remove_punctuation(words, chars=punctuation):
    """Strip every character in ``chars`` from each word.

    Args:
        words: iterable of strings to clean.
        chars: characters to delete; defaults to the module-level
            ``punctuation`` string (which also contains the space character).

    Returns:
        List of cleaned words in input order; words that end up empty are
        dropped.
    """
    table = str.maketrans('', '', chars)
    cleaned = (word.translate(table) for word in words)
    return [word for word in cleaned if word != '']


# Initialization
listOfWords = []
print("This program will remove all punctuation [ " + punctuation + " ] "
      "from read file and write it back to new file")

try:
    # reading a file
    with open('../Python-Programs/data/vocab.pubmed.txt','r') as file:
        listOfWords = remove_punctuation(file.read().split('\n'))
except IOError:
    print("File does not exist")
except Exception as e:
    print("Program ended with an error ", e)
else:
    # writing a file -- only after a successful read, so a failed read does
    # not overwrite an existing output file with an empty one
    with open('../Python-Programs/data/vocab.pubmed.Out.txt','w') as file:
        file.write('\n'.join(listOfWords))

In [None]:
# Q4
# importing sqlite
import sqlite3

# the 5 vocabulary files, in the order their records are numbered
VOCAB_FILES = [
    '../Python-Programs/data/vocab.pubmed.txt',
    '../Python-Programs/data/vocab.enron.txt',
    '../Python-Programs/data/vocab.kos.txt',
    '../Python-Programs/data/vocab.nips.txt',
    '../Python-Programs/data/vocab.nytimes.txt',
]


def load_records(paths):
    """Read every line of every file in ``paths``.

    Files are read one at a time, in order, and each is closed before the
    next is opened. Reading stops at the first file that fails; whatever was
    collected from the earlier files is still returned.

    Args:
        paths: iterable of text-file paths.

    Returns:
        Tuple ``(records, line_counts)``: ``records`` is the list of lines
        (without the trailing newline) from all files read, ``line_counts``
        holds the number of lines of each file read (0 for an empty file).
    """
    records = []
    line_counts = []
    try:
        for path in paths:
            with open(path, 'r') as f:
                lines = [line.rstrip('\n') for line in f]
            records.extend(lines)
            # Count from the lines actually read, so an empty file yields 0
            # rather than reusing a loop counter left over from another file.
            line_counts.append(len(lines))
    except IOError:
        print("File does not exist")
    except Exception as e:
        print("Program ended with an error ", e)
    return records, line_counts


def store_records(records, db_path='iNeuron.db'):
    """Replace the contents of table ``allRecords`` with ``records``.

    The table is created if it does not exist yet, emptied, refilled, and the
    transaction is committed so the rows persist after the connection closes.

    Args:
        records: iterable of ``(sequence, data)`` tuples.
        db_path: path of the SQLite database file.

    Returns:
        List of all rows in ``allRecords`` after the insert.
    """
    db = sqlite3.connect(db_path)
    try:
        cur = db.cursor()
        # "if not exists" lets the cell be re-run against an existing DB
        cur.execute('create table if not exists allRecords(sequence int, data text)')
        # deleting table's record if any
        cur.execute('delete from allRecords')
        # Placeholders let sqlite3 do the quoting, so records containing
        # quote characters are stored verbatim.
        cur.executemany('insert into allRecords values (?, ?)', records)
        # Without a commit the inserts are discarded when the connection closes.
        db.commit()
        return cur.execute('select * from allRecords').fetchall()
    finally:
        db.close()


# Initialization
list_of_all_records, list_of_no_of_lines = load_records(VOCAB_FILES)

#sum of total number of lines
print(sum(list_of_no_of_lines))
# list of all records
print(list_of_all_records)

# generating sequence number list
list_of_sequence = [*range(0,sum(list_of_no_of_lines))]
# all records from 5 files as tuple
all_records_from_files = list(zip(list_of_sequence,list_of_all_records))

# just for fun writing whole data to file as string
# (skipped when nothing was loaded, e.g. because the data directory is missing)
# NOTE(review): "vocan" in the file name is presumably a typo for "vocab" -- confirm before renaming
if all_records_from_files:
    with open('../Python-Programs/data/vocan.all.records.txt','w') as allFile:
        allFile.write(str(all_records_from_files))

# inserting all records in bulk to SQLITE DB, then reading them back
# to see whether records inserted successfully or not
records = store_records(all_records_from_files)

for i in records:
    print(i)