In [None]:
'''Q2: classifying a new body of text
- take an input sentence
- split it into words, count each word, and store the counts in a dictionary
- build two such corpora [e.g. Test cricket commentary and a passage from a book]

- for a new input, check its similarity to both previous corpora
- decide whether the new input is more similar to the first or the second one
Naive Bayes is somehow involved
'''


import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def readFile(filePath, encoding=None):
    '''
    opens file in filePath and returns text inside it

    filePath: path of the text file to read
    encoding: optional text encoding handed to open(); the default None keeps
              open()'s platform-dependent default, so existing calls behave
              exactly as before. Pass e.g. 'utf-8' when the encoding is known.
    returns:  the complete file content as one string
    raises:   whatever open()/read() raise (e.g. FileNotFoundError,
              UnicodeDecodeError)
    '''
    with open(filePath, 'r', encoding=encoding) as file:
        return file.read()
    
def processText(text):
    '''
    deletes the characters ! . , ? : ; from text and returns the
    remainder in lower case
    '''
    unwanted = ('!', '.', ',', '?', ':', ';')
    # one translation table removes every unwanted character in a single pass
    removal_table = str.maketrans('', '', ''.join(unwanted))
    return text.translate(removal_table).lower()

def getWordFrequency(text):
    '''
    counts how often each whitespace-separated word occurs in text and
    returns the counts as a dictionary {word: count}
    '''
    counts = {}
    for token in text.split():
        # a word seen for the first time starts from 0
        counts[token] = counts.get(token, 0) + 1
    return counts



# Inputs: two reference corpora read from disk and the text to classify
# (the input text is hard-coded for now instead of being read from a file).
corpus1_path = r'corpus1.txt'
corpus1_text = readFile(corpus1_path)
corpus2_path = r'corpus2.txt'
corpus2_text = readFile(corpus2_path)
input_text = 'climate'

# Normalise every text, then count its words.
corpus1_processed = processText(corpus1_text)
corpus2_processed = processText(corpus2_text)
input_processed = processText(input_text)

corpus1_wordFrequency = getWordFrequency(corpus1_processed)
corpus2_wordFrequency = getWordFrequency(corpus2_processed)
input_wordFrequency = getWordFrequency(input_processed)

# The similarity is computed with cosine similarity over a shared vocabulary:
# every word that occurs in at least one of the two corpora.
word_vocabulary = set(corpus1_wordFrequency.keys()) | set(corpus2_wordFrequency.keys())

# building a vector based on frequency of words
def build_vector(wordFrequency, vocabulary):
    '''
    returns a numpy array that holds, for every word of vocabulary (in its
    iteration order), the count stored in wordFrequency, or 0 when absent
    '''
    counts = []
    for term in vocabulary:
        counts.append(wordFrequency.get(term, 0))
    return np.array(counts)

corpus1_vector = build_vector(corpus1_wordFrequency, word_vocabulary)
corpus2_vector = build_vector(corpus2_wordFrequency, word_vocabulary)
input_vector = build_vector(input_wordFrequency, word_vocabulary)

# cosine_similarity works on 2-D input (samples x features), so every 1-D
# count vector is turned into a single-row matrix
corpus1_vector = corpus1_vector.reshape(1, -1)
corpus2_vector = corpus2_vector.reshape(1, -1)
input_vector = input_vector.reshape(1, -1)

# each call compares one row with one row, giving a 1x1 result matrix
input_corpus1_similarity = cosine_similarity(input_vector, corpus1_vector)
input_corpus2_similarity = cosine_similarity(input_vector, corpus2_vector)

# pull out plain floats so the decision below compares scalars, not arrays
corpus1_score = float(input_corpus1_similarity[0, 0])
corpus2_score = float(input_corpus2_similarity[0, 0])

# A tie must not be reported as "corpus 2". Ties occur, for instance, when
# the input shares no word with either corpus and both scores are equal.
if corpus1_score > corpus2_score:
    print("Similar to corpus 1")
elif corpus2_score > corpus1_score:
    print("Similar to corpus 2")
else:
    print("Equally similar to corpus 1 and corpus 2")


In [None]:
# iterator example

import time

class InfIterEven:
    '''Infinite iterator to return all even numbers

    Produces 0, 2, 4, ... and never raises StopIteration, so the caller has
    to bound the consumption (break, fixed-size loop, ...).
    '''
    def __init__(self):
        '''start at 0 so that next() also works without a prior iter() call'''
        self.num = 0

    def __iter__(self):
        '''returns iterator object

        Returns self without touching the current position, as the iterator
        protocol expects; create a new InfIterEven() to start again from 0.
        '''
        return self

    def __next__(self):
        '''used to get next element'''
        num = self.num
        self.num += 2
        return num
    

# The iterator never ends, so only a fixed number of values is printed.
# An unbounded `while True` here would keep "Restart & Run All" from ever
# reaching the cells below.
DEMO_VALUE_COUNT = 10

a = iter(InfIterEven())
for _ in range(DEMO_VALUE_COUNT):
    print(next(a))
    time.sleep(0.50)

In [None]:
# create a generator function that yields powers of a given number up to a specified limit


import time

def powerGenerator(number, power_limit):
    '''generator yielding number**1, number**2, ... for as long as the
    exponent does not exceed power_limit (yields nothing if power_limit < 1)'''
    exponent = 1
    while exponent <= power_limit:
        yield number ** exponent
        exponent += 1


base, max_power = 2, 10
x = powerGenerator(base, max_power)

# A for loop ends cleanly once the generator is exhausted; driving it with
# next() inside `while True` would instead end in an uncaught StopIteration.
for power_value in x:
    print(power_value, end=" ")
    time.sleep(0.50)
print()

In [None]:
# we can also use Counter from collections

from collections import Counter
words = ["hello", "there", "hello"]
# Counter tallies the items of the list; ending the cell with the bare name
# lets the notebook display the resulting counts
word_counts = Counter(words)
word_counts

In [None]:
# for removing punctuations

textline = "Copyright 1990, Jim Prentice, Brandon, CANADA, CANADA, Brandon."
# a mapping whose values are None tells translate() to delete those characters
translator = str.maketrans(dict.fromkeys(',-.'))
textline.translate(translator)

In [None]:
# stop words

import nltk
from nltk.corpus import stopwords

# download the 'stopwords' resource, then load the English list as a set
nltk.download('stopwords')
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)
print(stop_words)



In [None]:
'''100west.txt
1. read the file
1.1 remove punctuation
1.2 remove stopwords
2. get a count of all words present in the file in a dictionary format
3. convert the dictionary into JSON format and dump it into a file
'''


import json
import nltk
from nltk.corpus import stopwords


def readFile(filePath, encoding=None):
    '''opens file present in filePath and returns text inside it

    filePath: path of the text file to read
    encoding: optional text encoding handed to open(); the default None keeps
              open()'s platform-dependent default, so existing calls behave
              exactly as before. Pass e.g. 'utf-8' when the encoding is known.
    returns:  the complete file content as one string
    raises:   whatever open()/read() raise (e.g. FileNotFoundError,
              UnicodeDecodeError)
    '''
    with open(filePath, 'r', encoding=encoding) as file:
        return file.read()
    

def processText(text):
    '''remove punctuations and stopwords and return in string format

    text:    raw input string
    returns: the remaining words, lower-cased and joined by single spaces
    '''
    lst = [',', '.', ';', ':', '?', '!', '\"', '(', ')']
    for char in lst:
        # swap punctuation for a space so neighbouring words stay separated
        text = text.replace(char, ' ')

    # removing stopwords
    # Download the corpus only when it is missing. NLTK signals an absent
    # resource with LookupError, and an unconditional download would contact
    # the network on every single call of this function.
    try:
        stop_words = set(stopwords.words('english'))
    except LookupError:
        nltk.download('stopwords')
        stop_words = set(stopwords.words('english'))

    # lower-case each word once and reuse it for both the test and the result
    lowered_words = (word.lower() for word in text.split())
    filtered_words = [word for word in lowered_words if word not in stop_words]

    return ' '.join(filtered_words)
    

def getWordFrequency(text):
    '''returns a dictionary mapping every whitespace-separated word of text to its number of occurrences'''
    tally = {}
    for token in text.split():
        try:
            tally[token] += 1
        except KeyError:
            # first occurrence of this word
            tally[token] = 1
    return tally


def dumpToJsonFile(data, outputFilePath):
    '''write data as JSON with a 4-space indent to the given file path (the file is opened in 'w' mode)'''
    with open(outputFilePath, mode='w') as target:
        json.dump(data, fp=target, indent=4)




# location of the text to analyse
filePath = r'100west.txt'

# raw content -> cleaned text (punctuation and stopwords removed) -> word counts
file_text = readFile(filePath=filePath)
file_processed = processText(text=file_text)
file_wordFrequency = getWordFrequency(text=file_processed)

# persist the counts as JSON
dumpToJsonFile(data=file_wordFrequency, outputFilePath=r'output_file.json')

In [None]:
# python code implementing inheritance and encapsulation


class Vehicle:
    '''Base class that keeps a vehicle's name and registration number and
    exposes them through setter/getter methods.'''

    def __init__(self, name=None, regNo=None):
        '''initialise both attributes so the getters work even before a
        setter has been called (they return None until a value is set)'''
        self.name = name
        self.regNo = regNo

    def setName(self, name):
        '''store the vehicle name'''
        self.name = name

    def getName(self):
        '''return the stored vehicle name (None if never set)'''
        return self.name

    def setRegistrationNumber(self, regNo):
        '''store the registration number'''
        self.regNo = regNo

    def getRegistrationNo(self):
        '''return the stored registration number (None if never set)'''
        return self.regNo

    
    
class Car(Vehicle):
    '''Vehicle that additionally records how many passengers it carries.'''

    def __init__(self, *args, **kwargs):
        '''pass any constructor arguments on to the base class and start with
        an unset (None) passenger count, so getPassengerCount() does not
        raise AttributeError before setPassengerCount() was called'''
        super().__init__(*args, **kwargs)
        self.passengerCount = None

    def setPassengerCount(self, passengerCount):
        '''store the number of passengers'''
        self.passengerCount = passengerCount

    def getPassengerCount(self):
        '''return the stored number of passengers (None if never set)'''
        return self.passengerCount
    
    
    
class Truck(Vehicle):
    '''Vehicle that additionally records a cargo manifest and a permit id.'''

    def __init__(self, *args, **kwargs):
        '''pass any constructor arguments on to the base class and start with
        unset (None) cargo manifest and permit id, so the getters do not
        raise AttributeError before the matching setter was called'''
        super().__init__(*args, **kwargs)
        self.cargoManifest = None
        self.permitId = None

    def setCargoManifest(self, cargoManifest):
        '''store the cargo manifest'''
        self.cargoManifest = cargoManifest

    def getCargoManifest(self):
        '''return the stored cargo manifest (None if never set)'''
        return self.cargoManifest

    def setTruckPermitId(self, permitId):
        '''store the truck permit id'''
        self.permitId = permitId

    def getPermitId(self):
        '''return the stored truck permit id (None if never set)'''
        # the value must be returned explicitly; a bare `self.permitId`
        # expression would make this getter always return None
        return self.permitId
        
        
# create a truck and name it through the setter inherited from Vehicle
truckObject = Truck()
truckObject.setName(name="Truck 1")
