In [1]:
import nltk, re
from nltk.metrics.distance  import edit_distance
from nltk.probability import FreqDist
from nltk.corpus import words

<h1 style='color:red'>Q1 - Regular Expressions:</h1>

<h4 style='color:black'><u>Functions:</u></h4>

In [2]:
def get_Time(string):
    return [w for w in string if re.search(r'^([0-1][0-9]|2[0-3]):[0-5][0-9]$', w)]

def get_Phonenumber(string):
    list_numbers=[]
    list_numbers.append(re.findall(r'0[2,3,4,8,9]-?\d{3}-?\d{4}', string))
    list_numbers.append(re.findall(r'\(05[0,2,3,4,5,8]\) \d{7}', string))
    list_numbers.append(re.findall(r'(05[0,2,3,4,5,8]-?\d{7})\b', string))
    return list_numbers

def get_Comments(string):
    return re.findall(r'/\*.*?\*/', string)
    
def get_thirty_num(string):  
    return re.findall(r'thirty(?:-one|-two|-three|-four|-five|-six|-seven|-eight|-nine)?\b', string)

def get_Dates(string): 
    return [w for w in string.split() if re.search(r'^\d{4}[\-\/\s]?((((0[13578])|(1[02]))[\-\/\s]?(([0-2][0-9])|(3[01])))|(((0[469])|(11))[\-\/\s]?(([0-2][0-9])|(30)))|(02[\-\/\s]?[0-2][0-9]))$', w)]

<h1 style='color:red'>Q2 - Similarity between strings:</h1>

<h4 style='color:black'><u>Helper Functions</u></h4>

In [3]:
def get_error_string(string): # find the error in the string 
    return re.findall(r'<ERR\starg=.*?</ERR>', string)

def get_tuples(list_errors): #make tuples of (current word, error word)
    list_tuples = []
    for error in list_errors:
        cur_word = error[error.find('=')+1:error.find('>')]
        mis_word = error[error.find('>')+2:error.find('/')-2]
        list_tuples.append((cur_word,mis_word))
        
    return(list_tuples)

def get_distance(list_tuples): # get a list of tuples and add to the tuples the distance -> (cur word, error word , distance)
    tuples_with_dis=[]
    for words_tuple in list_tuples:
        distance=nltk.edit_distance(words_tuple[0],words_tuple[1]) # compute distance 
        tuples_with_dis.append((words_tuple[0],words_tuple[1],distance))
    return tuples_with_dis
            
def get_number_of_distance(list_tuples,distance, operator): # 1- equal , 2- >distance (bigger than)
    count=0
    for words_tuple in list_tuples:
        if(operator ==1):
            if(words_tuple[2] == distance):
                count+=1
        if(operator ==2):
            if(words_tuple[2] > distance):
                count+=1
    return count

def get_list_by_distance(list_tuples,distance,operator): # 1- equal , 2- >distance (bigger than)
    list_by_distance=[]
    for words_tuple in list_tuples:
        if(operator ==1):
            if(words_tuple[2] == distance):
                list_by_distance.append((words_tuple[0].lower(),words_tuple[1].lower()))
        if(operator ==2):
            if(words_tuple[2] > distance):
                list_by_distance.append((words_tuple[0].lower(),words_tuple[1].lower()))
    return list_by_distance

<h4 style='color:black'><u>Pre-Processing </u></h4>

In [4]:
with open('misspellings_and_corrections.txt') as f: #open the file into lines
    lines = f.readlines()

lines_string = ' '.join(str(line) for line in lines)
error_lines = get_error_string(lines_string)
cor_mis_tuples_list = get_tuples(error_lines)

In [5]:
lines_string = ' '.join(str(line) for line in lines)
error_lines = get_error_string(lines_string)
cor_mis_tuples_list = get_tuples(error_lines)

<h3 style='color:black'>A:</h3>

In [6]:
tuples_with_distance_list = get_distance(cor_mis_tuples_list)
get_distance(tuples_with_distance_list)

[('sister', 'siter', 1),
 ('sister', 'siter', 1),
 ('goes', 'go', 2),
 ('sometimes', 'some times', 1),
 ('sometimes', 'some times', 1),
 ('club', 'clob', 1),
 ('bellringing', 'bell ringing', 1),
 ('watch', 'wakh', 2),
 ('front', 'frount', 1),
 ('second', 'sexeon', 3),
 ('watch', 'wach', 1),
 ('watch', 'wach', 1),
 ('cowboys', 'cow Boys', 2),
 ('sometimes', 'some times', 1),
 ('club', 'colbe', 3),
 ('watch', 'wach', 1),
 ('watch', 'wach', 1),
 ('think', 'thing', 1),
 ('TV', 'tv', 2),
 ('square', 'squar', 1),
 ('eyes', 'iyes', 1),
 ("o'clock", 'oclock', 1),
 ('knocked', 'nock', 3),
 ('at', 'a', 1),
 ("o'clock", 'oclock', 1),
 ('killed', 'kild', 2),
 ('saw', 'see', 2),
 ('been', 'bean', 1),
 ('knocked', 'nock', 3),
 ('called', 'cald', 2),
 ('came', 'cam', 1),
 ('killed', 'killd', 1),
 ('there', 'the', 2),
 ('Her', 'Here', 1),
 ('eyes', 'iyes', 1),
 ('have to', 'haveto', 1),
 ('before', 'be for', 2),
 ('anything', 'any thing', 1),
 ('else', 'als', 2),
 ('wheel', 'weel', 1),
 ('wheel', 'wee

<h3 style='color:black'>B:</h3>

In [7]:
mistake_per_1 = get_number_of_distance(tuples_with_distance_list,1,1) / (len(tuples_with_distance_list))
mistake_per_2 = get_number_of_distance(tuples_with_distance_list,2,1) / (len(tuples_with_distance_list))
mistake_per_bigger_than_2 = get_number_of_distance(tuples_with_distance_list,2,2) / (len(tuples_with_distance_list))

print("The percentage of errors with distance 1 is: " + str(mistake_per_1))
print("The percentage of errors with distance 2 is: " + str(mistake_per_2))
print("The percentage of errors with distance >2 is: " + str(mistake_per_bigger_than_2))

The percentage of errors with distance 1 is: 0.5540592535590612
The percentage of errors with distance 2 is: 0.29857637552904964
The percentage of errors with distance >2 is: 0.1473643709118892


<h3 style='color:black'>The 10 most common errors in a file for any distance edit = 1</h3>

In [8]:
fdist=FreqDist(get_list_by_distance(tuples_with_distance_list,1,1)) # make a list of freq
fdist.most_common(10)

[(('james', 'jame'), 21),
 (('too', 'to'), 20),
 (("don't", 'dont'), 20),
 (('to', 'two'), 17),
 (('her', 'here'), 16),
 (('got', 'go'), 13),
 (('off', 'of'), 13),
 (('his', 'is'), 12),
 (("that's", 'thats'), 11),
 (("it's", 'its'), 11)]

<h3 style='color:black'>The 10 most common errors in a file for any distance edit = 2</h3>

In [9]:
fdist=FreqDist(get_list_by_distance(tuples_with_distance_list,2,1)) # make a list of freq
fdist.most_common(10)

[(('their', 'there'), 20),
 (('there', 'their'), 17),
 (('here', 'hear'), 10),
 (('know', 'no'), 9),
 (('vet', 'vethn'), 9),
 (('looked', 'look'), 8),
 (("you're", 'your'), 7),
 (('their', 'they'), 7),
 (('there', 'they'), 6),
 (('farmer', 'farm'), 6)]

<h3 style='color:black'>The 10 most common errors in a file for any distance edit >2</h3>

In [10]:
fdist=FreqDist(get_list_by_distance(tuples_with_distance_list,2,2)) # make a list of freq
fdist.most_common(10) 

[(('farmer', 'frmh'), 10),
 (('stopped', 'stop'), 6),
 (('teddy boy', 'tedeboy'), 5),
 (('teddy boy', 'teddoy'), 4),
 (('cheetah', 'cheter'), 4),
 (('walked', 'wark'), 3),
 (('hutch', 'hudge'), 3),
 (('signpost', 'sine post'), 3),
 (('they', 'thrar'), 3),
 (('knocked', 'nock'), 2)]

<h3 style='color:black'>C:</h3>

In [49]:
def get_alternative_edit1(err_word): # the function get word and return alternative word with distance 1 from words.words libary
    alternative_words=[]
    for word in words.words():
        if(nltk.edit_distance(err_word,word.lower()) == 1):
            alternative_words.append(word.lower())
    return alternative_words   

<h3 style='color:black'>D:</h3>

ב-50 הטעויות הראשונות יש כפילויות ולכן סך הכל יש 40 מילים שונות 

In [56]:
alternative_words = {}
correct_word_exist = 0
first_50_error_tuple = cor_mis_tuples_list[0:50]
list_that_not_include = [] # רשימת המילים שהמילה הנכונה לא הופיעה במועמדים
for error in first_50_error_tuple:
    err_word = error[1].lower()
    corr_word = error[0].lower()
    if(err_word not in alternative_words.keys()):
        alternative_words[err_word] = list(set(get_alternative_edit1(err_word))) # מכיוון שהפכנו את כל המילים לאותיות קטנות אז יש חזרות של מילים כאשר אחת עם אות קטנה והשנייה עם אות גדולה
        if(corr_word in alternative_words[err_word]):
            correct_word_exist += 1 # אם המילה נמצאת, מעלה את המונה
        else:
            list_that_not_include.append(err_word) # אחרת מוסיף אותה לרשימת המילים שלא היו ברשימה שלהן
            

In [65]:
import statistics
list_mean = []

for word_list in alternative_words.values():
    list_mean.append(len(word_list))
    
print("The average of candidates for each word is: " + str(statistics.mean(list_mean)))   
print("The percentage of cases the correct word was included in the list " + str(round((correct_word_exist/len(list_mean)),2)) +"%")   

The average of candidates for each word is: 18.425
The percentage of cases the correct word was included in the list 0.4%


In [58]:
first_50_error_tuple

[('sister', 'siter'),
 ('sister', 'siter'),
 ('goes', 'go'),
 ('sometimes', 'some times'),
 ('sometimes', 'some times'),
 ('club', 'clob'),
 ('bellringing', 'bell ringing'),
 ('watch', 'wakh'),
 ('front', 'frount'),
 ('second', 'sexeon'),
 ('watch', 'wach'),
 ('watch', 'wach'),
 ('cowboys', 'cow Boys'),
 ('sometimes', 'some times'),
 ('club', 'colbe'),
 ('watch', 'wach'),
 ('watch', 'wach'),
 ('think', 'thing'),
 ('TV', 'tv'),
 ('square', 'squar'),
 ('eyes', 'iyes'),
 ("o'clock", 'oclock'),
 ('knocked', 'nock'),
 ('at', 'a'),
 ("o'clock", 'oclock'),
 ('killed', 'kild'),
 ('saw', 'see'),
 ('been', 'bean'),
 ('knocked', 'nock'),
 ('called', 'cald'),
 ('came', 'cam'),
 ('killed', 'killd'),
 ('there', 'the'),
 ('Her', 'Here'),
 ('eyes', 'iyes'),
 ('have to', 'haveto'),
 ('before', 'be for'),
 ('anything', 'any thing'),
 ('else', 'als'),
 ('wheel', 'weel'),
 ('wheel', 'weel'),
 ('sallies', 'sally'),
 ('others', 'other'),
 ('rounds', 'rouns'),
 ('do', 'don'),
 ('ring', 'rings'),
 ('be', 'we'

In [59]:
list_that_not_include # List of words not found in their alternative words list

['go',
 'bell ringing',
 'wakh',
 'sexeon',
 'cow boys',
 'colbe',
 'tv',
 'iyes',
 'oclock',
 'nock',
 'kild',
 'see',
 'cald',
 'killd',
 'the',
 'haveto',
 'be for',
 'als',
 'sally',
 'other',
 'rouns',
 'brakes',
 'carfull',
 'cynthia']

In [60]:
for word in list_that_not_include:
    print(alternative_words[word])
    print('\n')

['ago', 'ao', 'bo', 'do', 'ego', 'fo', 'g', 'g', 'ga', 'ga', 'ge', 'ge', 'geo', 'gi', 'gio', 'goa', 'gob', 'god', 'god', 'gog', 'goi', 'gol', 'gon', 'goo', 'gor', 'gor', 'gos', 'got', 'goy', 'ho', 'ho', 'io', 'io', 'jo', 'jo', 'ko', 'ko', 'lo', 'lo', 'mo', 'mo', 'no', 'no', 'o', 'o', 'po', 'po', 'ro', 'so', 'to', 'wo', 'yo', 'zo', 'do', 'no', 'so', 'to']


[]


['rakh', 'wah', 'waka', 'wake', 'wakf', 'wakhi', 'waky', 'wash', 'wath', 'wash']


['sexern', 'sexton']


[]


['cole', 'cole']


['t', 't', 'ta', 'tav', 'td', 'te', 'th', 'ti', 'ti', 'to', 'tu', 'v', 'v', 'to']


['ides', 'yes', 'yes']


['clock', 'ollock', 'clock']


['bock', 'cock', 'dock', 'hock', 'jock', 'jock', 'knock', 'lock', 'mock', 'neck', 'nick', 'nick', 'nook', 'ock', 'pock', 'rock', 'snock', 'sock', 'tock', 'yock', 'lock', 'neck', 'sock']


['gild', 'keld', 'kil', 'kill', 'kiln', 'kilo', 'kilp', 'kilt', 'kind', 'mild', 'wild', 'kind']


['bee', 'bee', 'cee', 'dee', 'fee', 'gee', 'gee', 'kee', 'lee', 'lee', 'nee', 'p


<div dir='rtl'>
<u> סיבות: </u>
 </div>  
 <br>
 <div dir='rtl'>
    
- הפונקציה נותנת לנו את המילים עם טעות 1 , אך מתוך 50 המילים הראשונות המילה הנכונה מגיעה אחרי יותר משינוי אחד ולכן לא מופיעה ברשימה 
- סיומות של מילה שהאלגוריתם מזהה אותם בצורה לא תקינה - לדוגמה 'rounds', 'rouns' הפונקציה החזירה round אך הוא החשיב את ה-s כחלק מהמילה ולא כרבים

      
</div>

  <div dir='rtl'>
<u>הצעות לשיפור:</u>
</div>
<br>
<div dir='rtl'>
    
- להחזיר בפונקציה מילים עם כל טעות אפשרית ואז נמצא את המילה הנכונה ברשימה
- להוריד מהמילה שבודקים סיומות של ed,ing וכו' 
-לבדוק לפי שורש המילה
  
</div>

<h1 style='color:red'>Q3 - Similarity between texts:</h1>

In [9]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords
import numpy as np
import pandas as pd
import nltk, re
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
from collections import Counter 
import matplotlib.pyplot as plt

# train documents:
news_group = fetch_20newsgroups(subset='train')

#news_group.filenames
news_group_data = news_group.data
news_group_target_names = news_group.target_names
news_group_target = news_group.target

# test documents:
news_group_test = fetch_20newsgroups(subset='test')
#news_group_test.filenames

In [2]:
# fit the data into vectors (without including stop words and undefined words)
stopwords = stopwords.words('english')

vectorizer = CountVectorizer(stop_words=stopwords,token_pattern=r'\b[a-zA-Z]{2,}\b')
vectorizer.fit(news_group_data)

CountVectorizer(stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours',
                            'ourselves', 'you', "you're", "you've", "you'll",
                            "you'd", 'your', 'yours', 'yourself', 'yourselves',
                            'he', 'him', 'his', 'himself', 'she', "she's",
                            'her', 'hers', 'herself', 'it', "it's", 'its',
                            'itself', ...],
                token_pattern='\\b[a-zA-Z]{2,}\\b')

In [3]:
list_words = list(vectorizer.vocabulary_.keys()) # list of our words

In [4]:
list_words

['lerxst',
 'wam',
 'umd',
 'edu',
 'thing',
 'subject',
 'car',
 'nntp',
 'posting',
 'host',
 'organization',
 'university',
 'maryland',
 'college',
 'park',
 'lines',
 'wondering',
 'anyone',
 'could',
 'enlighten',
 'saw',
 'day',
 'door',
 'sports',
 'looked',
 'late',
 'early',
 'called',
 'bricklin',
 'doors',
 'really',
 'small',
 'addition',
 'front',
 'bumper',
 'separate',
 'rest',
 'body',
 'know',
 'tellme',
 'model',
 'name',
 'engine',
 'specs',
 'years',
 'production',
 'made',
 'history',
 'whatever',
 'info',
 'funky',
 'looking',
 'please',
 'mail',
 'thanks',
 'il',
 'brought',
 'neighborhood',
 'guykuo',
 'carson',
 'washington',
 'guy',
 'kuo',
 'si',
 'clock',
 'poll',
 'final',
 'call',
 'summary',
 'reports',
 'keywords',
 'acceleration',
 'upgrade',
 'article',
 'shelley',
 'fair',
 'number',
 'brave',
 'souls',
 'upgraded',
 'oscillator',
 'shared',
 'experiences',
 'send',
 'brief',
 'message',
 'detailing',
 'procedure',
 'top',
 'speed',
 'attained',
 'cp

In [7]:
print(len(vectorizer.vocabulary_)) # number of total words from all texts

81604


In [8]:
# this function gets a vector of text and its length, and returns the TF vector
def computeTF(CountVector, bagOfWordsCount):
    for i in range(len(CountVector)):
        CountVector[i] = CountVector[i] / float(bagOfWordsCount)
    return CountVector

# this function returns the amount of words in the text
def number_of_words(text):
    list_words=[]
    for word in nltk.word_tokenize(text):
        if word.isalpha() and word.lower() not in stopwords:
            list_words.append(word)
    return len(list_words)
    

## TF

In [None]:
# for each text in the train data computes the TF value, using the computeTF function
vector_train_texts = [] # contains all the train texts as TF vectors
for text in news_groccup_data:
    train_cv = vectorizer.transform([text])
    train_arr = list(np.array(train_cv.toarray(), dtype = 'float32')) # for space efficiency
    TF_train = computcceTF(train_arr[0],number_of_words(text))
    vector_train_texts.append(TF_train)          

In [None]:
# for each text in the  test data computes the TF value, using the computeTF function
vector_test_texts = [] # contains all the test texts as TF vectors
for text in news_group_test.data:
    test_cv = vectorizer.transform([text])
    test_arr = list(np.array(test_cv.toarray(), dtype = 'float32')) # for space efficiency
    TF_test = computeTF(test_arr[0],number_of_words(text))
    vector_test_texts.append(TF_test)  

### Cosine Similarity

In [36]:
# find the cosine similarity with TF for each text in the text with each text in the train
cosine_similarity(vector_train_texts, vector_test_texts, dense_output=True)

array([[0.16755994, 0.11645254, 0.06447455, ..., 0.11481771, 0.16457605,
        0.08694906],
       [0.10223866, 0.10619193, 0.05810201, ..., 0.10144055, 0.14540166,
        0.04155199],
       [0.17719492, 0.13031621, 0.05656578, ..., 0.07195257, 0.10920127,
        0.05944164],
       ...,
       [0.03066598, 0.02435721, 0.03775938, ..., 0.03955456, 0.03543516,
        0.0925846 ],
       [0.09075298, 0.10211729, 0.07449681, ..., 0.09267081, 0.09787577,
        0.11987286],
       [0.12416854, 0.14793591, 0.0764451 , ..., 0.0960954 , 0.11478341,
        0.04686013]], dtype=float32)

In [37]:
df = pd.DataFrame(cosine_similarity(vector_train_texts, vector_test_texts,  dense_output=True))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7522,7523,7524,7525,7526,7527,7528,7529,7530,7531
0,0.167560,0.116453,0.064475,0.075293,0.049992,0.087478,0.042343,0.075745,0.140803,0.080457,...,0.092189,0.149225,0.054428,0.092119,0.160704,0.104001,0.152039,0.114818,0.164576,0.086949
1,0.102239,0.106192,0.058102,0.051626,0.046928,0.058897,0.031798,0.035551,0.137460,0.046994,...,0.074557,0.130740,0.089923,0.062889,0.075858,0.082983,0.144223,0.101441,0.145402,0.041552
2,0.177195,0.130316,0.056566,0.073853,0.027412,0.100187,0.066337,0.089001,0.123532,0.095237,...,0.093324,0.119491,0.054574,0.089216,0.089199,0.073321,0.120351,0.071953,0.109201,0.059442
3,0.117600,0.122147,0.089109,0.073521,0.124152,0.099014,0.073152,0.049072,0.170276,0.092665,...,0.189895,0.128901,0.122240,0.036169,0.059493,0.084222,0.103683,0.128350,0.133799,0.150214
4,0.101739,0.094277,0.093954,0.063605,0.141653,0.090356,0.068559,0.030661,0.148187,0.057898,...,0.143527,0.120808,0.141003,0.054238,0.078061,0.094721,0.136043,0.131228,0.086212,0.127984
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,0.069314,0.047189,0.042673,0.072739,0.056132,0.074156,0.080072,0.040285,0.086536,0.025358,...,0.087167,0.039193,0.090060,0.039591,0.060780,0.049168,0.087010,0.057474,0.036614,0.112107
11310,0.096181,0.061115,0.094743,0.048104,0.017218,0.038785,0.038888,0.052174,0.090521,0.065682,...,0.071642,0.129436,0.074981,0.092294,0.080122,0.065667,0.132285,0.124059,0.088911,0.058076
11311,0.030666,0.024357,0.037759,0.045533,0.050321,0.013249,0.061995,0.020794,0.082461,0.045810,...,0.072679,0.030345,0.047814,0.030653,0.040336,0.028551,0.011716,0.039555,0.035435,0.092585
11312,0.090753,0.102117,0.074497,0.047281,0.063179,0.078421,0.030578,0.047862,0.111850,0.038735,...,0.081938,0.113750,0.082542,0.078619,0.049737,0.112657,0.092459,0.092671,0.097876,0.119873


In [38]:
# predict the closet neighbor for each test using the cosine similarity
df_tf_cos_predict = df.idxmax(axis = 0)
df_tf_cos_predict

0        9048
1        4114
2        3172
3       10575
4       10278
        ...  
7527     9818
7528     9963
7529     2018
7530     2965
7531     7192
Length: 7532, dtype: int64

In [39]:
# find the topic using the topic of the closet neighbor according to the cosine similarity
new_df_predict = pd.DataFrame(columns=['similarity_text', 'per_group'])

for i in range(len(df_tf_cos_predict.values)):
    row_df = pd.DataFrame({'similarity_text': [df_tf_cos_predict.values[i]],'per_group': [news_group_target_names[news_group_target[df_tf_cos_predict.values[i]]]]})
    new_df_predict = new_df_predict.append(row_df,ignore_index=True)
new_df_predict   

  new_df_predict = new_df_predict.append(row_df,ignore_index=True)


Unnamed: 0,similarity_text,per_group
0,9048,sci.electronics
1,4114,sci.crypt
2,3172,alt.atheism
3,10575,sci.crypt
4,10278,alt.atheism
...,...,...
7527,9818,sci.space
7528,9963,rec.sport.baseball
7529,2018,rec.sport.baseball
7530,2965,alt.atheism


### Dot Product

In [9]:
dot_df = pd.DataFrame()

for test_count in range(len(vector_test_texts)):
    vector_list= []
    for train_count in range(len(vector_train_texts)):
        vector_list.append(np.dot(vector_test_texts[test_count],vector_train_texts[train_count]))
    dot_df[test_count] = vector_list
dot_df

  dot_df[test_count] = vector_list


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7522,7523,7524,7525,7526,7527,7528,7529,7530,7531
0,0.005291,0.003378,0.002480,0.000927,0.002165,0.001263,0.002101,0.002381,0.004699,0.002087,...,0.002470,0.005952,0.001032,0.001889,0.004371,0.002055,0.004779,0.002811,0.003843,0.002486
1,0.002972,0.002836,0.002058,0.000585,0.001871,0.000783,0.001452,0.001029,0.004224,0.001122,...,0.001839,0.004801,0.001570,0.001187,0.001899,0.001510,0.004173,0.002286,0.003125,0.001094
2,0.003106,0.002098,0.001208,0.000505,0.000659,0.000803,0.001827,0.001553,0.002288,0.001371,...,0.001388,0.002646,0.000574,0.001015,0.001346,0.000804,0.002100,0.000978,0.001415,0.000943
3,0.003821,0.003647,0.003527,0.000932,0.005532,0.001471,0.003735,0.001587,0.005848,0.002474,...,0.005235,0.005291,0.002386,0.000763,0.001665,0.001713,0.003353,0.003233,0.003215,0.004420
4,0.002554,0.002175,0.002874,0.000623,0.004876,0.001037,0.002705,0.000766,0.003932,0.001194,...,0.003057,0.003831,0.002126,0.000884,0.001688,0.001488,0.003400,0.002554,0.001600,0.002910
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,0.001425,0.000891,0.001068,0.000583,0.001582,0.000697,0.002586,0.000824,0.001880,0.000428,...,0.001520,0.001018,0.001112,0.000528,0.001076,0.000632,0.001780,0.000916,0.000556,0.002087
11310,0.002894,0.001689,0.003472,0.000564,0.000710,0.000534,0.001838,0.001563,0.002878,0.001623,...,0.001828,0.004919,0.001355,0.001803,0.002076,0.001237,0.003961,0.002894,0.001978,0.001582
11311,0.001122,0.000819,0.001684,0.000650,0.002525,0.000222,0.003565,0.000758,0.003190,0.001377,...,0.002257,0.001403,0.001051,0.000728,0.001271,0.000654,0.000427,0.001122,0.000959,0.003069
11312,0.002497,0.002581,0.002497,0.000507,0.002383,0.000987,0.001322,0.001311,0.003253,0.000876,...,0.001913,0.003953,0.001364,0.001404,0.001179,0.001940,0.002532,0.001977,0.001991,0.002987


In [41]:
# predict the closet neighbor for each test using the dot product
tf_dot_predict = dot_df.idxmax(axis = 0)
tf_dot_predict

0        9229
1       10544
2        3172
3        3936
4        7996
        ...  
7527     2051
7528    10400
7529     4633
7530     2965
7531     9266
Length: 7532, dtype: int64

In [42]:
# find the topic using the topic of the closet neighbor according to the dot product
new_df_predict = pd.DataFrame(columns=['similarity_text', 'per_group'])

for i in range(len(tf_dot_predict.values)):
    row_df = pd.DataFrame({'similarity_text': [tf_dot_predict.values[i]],'per_group': [news_group_target_names[news_group_target[tf_dot_predict.values[i]]]]})
    new_df_predict = new_df_predict.append(row_df,ignore_index=True)
new_df_predict 

  new_df_predict = new_df_predict.append(row_df,ignore_index=True)


Unnamed: 0,similarity_text,per_group
0,9229,talk.religion.misc
1,10544,sci.med
2,3172,alt.atheism
3,3936,talk.religion.misc
4,7996,comp.graphics
...,...,...
7527,2051,misc.forsale
7528,10400,comp.graphics
7529,4633,comp.windows.x
7530,2965,alt.atheism


## TF-IDF

In [10]:
# fit the tf-idf

stopwords = stopwords.words('english')
TfidfV = TfidfVectorizer(stop_words=stopwords,token_pattern=r'\b[a-zA-Z]{2,}\b')
TfidfV .fit(news_group_data)
tfidf_train = TfidfV.fit_transform(news_group_data)
tfidf_test = TfidfV.transform(news_group_test.data)

In [11]:
from sklearn.metrics.pairwise import cosine_similarity

# find the cosine similarity with TF-IDF for each text in the text with each text in the train
cosine_similarity(tfidf_train, tfidf_test, dense_output=True)

array([[0.08332137, 0.03527536, 0.01853714, ..., 0.08742758, 0.05421031,
        0.06243403],
       [0.03270318, 0.0265544 , 0.0138742 , ..., 0.04038842, 0.05893349,
        0.02222038],
       [0.12736379, 0.0644189 , 0.04219289, ..., 0.1020233 , 0.06047534,
        0.07989148],
       ...,
       [0.04359934, 0.01646304, 0.02089325, ..., 0.05089863, 0.02096217,
        0.04041883],
       [0.04901693, 0.03379068, 0.03787461, ..., 0.05323483, 0.03579556,
        0.07712006],
       [0.01811485, 0.03240313, 0.00738827, ..., 0.02692533, 0.01564765,
        0.00873507]])

### Cosine Similarity

In [12]:
df = pd.DataFrame(cosine_similarity(tfidf_train, tfidf_test,  dense_output=True))
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7522,7523,7524,7525,7526,7527,7528,7529,7530,7531
0,0.083321,0.035275,0.018537,0.118299,0.024184,0.080315,0.008866,0.058625,0.033229,0.038413,...,0.052374,0.022142,0.073994,0.109261,0.074325,0.077680,0.044835,0.087428,0.054210,0.062434
1,0.032703,0.026554,0.013874,0.071159,0.013450,0.039661,0.011453,0.019183,0.028105,0.018120,...,0.024914,0.018733,0.052463,0.051162,0.021390,0.042181,0.032917,0.040388,0.058933,0.022220
2,0.127364,0.064419,0.042193,0.182413,0.034789,0.112444,0.037736,0.075831,0.092063,0.077248,...,0.066659,0.040664,0.115229,0.157864,0.057421,0.116486,0.052006,0.102023,0.060475,0.079891
3,0.034234,0.025749,0.027134,0.061443,0.028669,0.049527,0.007710,0.032078,0.028755,0.043404,...,0.056760,0.020785,0.042762,0.041888,0.014944,0.038819,0.017683,0.047798,0.032429,0.048921
4,0.064976,0.028349,0.039684,0.111516,0.053901,0.081240,0.013304,0.038398,0.042316,0.038549,...,0.043241,0.028911,0.077037,0.084300,0.033363,0.079808,0.038025,0.076142,0.038687,0.076373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,0.051382,0.029407,0.022297,0.098721,0.032974,0.075335,0.021241,0.037797,0.034262,0.046881,...,0.038940,0.016997,0.069777,0.075766,0.036231,0.068229,0.060590,0.062102,0.030343,0.061808
11310,0.045498,0.021226,0.035215,0.087389,0.018055,0.049514,0.006679,0.030908,0.032490,0.030894,...,0.029142,0.018650,0.057320,0.072220,0.025060,0.051189,0.036202,0.058369,0.027292,0.026080
11311,0.043599,0.016463,0.020893,0.093904,0.018055,0.054786,0.012865,0.029916,0.034104,0.039360,...,0.041040,0.017048,0.068297,0.078398,0.024457,0.062752,0.014725,0.050899,0.020962,0.040419
11312,0.049017,0.033791,0.037875,0.078300,0.031965,0.071797,0.007815,0.038323,0.030658,0.034410,...,0.036514,0.030329,0.046550,0.069400,0.024503,0.069067,0.024432,0.053235,0.035796,0.077120


In [13]:
# predict the closet neighbor for each test using the cosine similarity
tfidf_cos_predict = df.idxmax(axis = 0)
tfidf_cos_predict

0        9048
1        4114
2        3172
3       10575
4       10278
        ...  
7527     9483
7528     2850
7529     6368
7530     2965
7531     9513
Length: 7532, dtype: int64

In [14]:
# find the topic using the topic of the closet neighbor according to the cosine similarity

new_df_tfidf = pd.DataFrame(columns=['similarity_text', 'per_group'])

for i in range(len(tfidf_cos_predict.values)):
    row_df = pd.DataFrame({'similarity_text': [tfidf_cos_predict.values[i]],'per_group': [news_group_target_names[news_group_target[tfidf_cos_predict.values[i]]]]})
    new_df_tfidf = new_df_tfidf.append(row_df,ignore_index=True)
new_df_tfidf 

Unnamed: 0,similarity_text,per_group
0,9048,sci.electronics
1,4114,sci.crypt
2,3172,alt.atheism
3,10575,sci.crypt
4,10278,alt.atheism
...,...,...
7527,9483,sci.space
7528,2850,comp.windows.x
7529,6368,rec.sport.hockey
7530,2965,alt.atheism


### Dot Product

In [None]:
df_tfidf_dot_predict = pd.DataFrame()
train_vectors = tfidf_train.toarray()
test_vectors = tfidf_test.toarray()
for test_count in range(len(test_vectors)):
    vector_list= []
    for train_count in range(len(train_vectors)):
        vector_list.append(np.dot(test_vectors[test_count],train_vectors[train_count]))
    df_tfidf_dot_predict[test_count] = vector_list

In [10]:
df_tfidf_dot_predict

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,7522,7523,7524,7525,7526,7527,7528,7529,7530,7531
0,0.083321,0.035275,0.018537,0.118299,0.024184,0.080315,0.008866,0.058625,0.033229,0.038413,...,0.052374,0.022142,0.073994,0.109261,0.074325,0.077680,0.044835,0.087428,0.054210,0.062434
1,0.032703,0.026554,0.013874,0.071159,0.013450,0.039661,0.011453,0.019183,0.028105,0.018120,...,0.024914,0.018733,0.052463,0.051162,0.021390,0.042181,0.032917,0.040388,0.058933,0.022220
2,0.127364,0.064419,0.042193,0.182413,0.034789,0.112444,0.037736,0.075831,0.092063,0.077248,...,0.066659,0.040664,0.115229,0.157864,0.057421,0.116486,0.052006,0.102023,0.060475,0.079891
3,0.034234,0.025749,0.027134,0.061443,0.028669,0.049527,0.007710,0.032078,0.028755,0.043404,...,0.056760,0.020785,0.042762,0.041888,0.014944,0.038819,0.017683,0.047798,0.032429,0.048921
4,0.064976,0.028349,0.039684,0.111516,0.053901,0.081240,0.013304,0.038398,0.042316,0.038549,...,0.043241,0.028911,0.077037,0.084300,0.033363,0.079808,0.038025,0.076142,0.038687,0.076373
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11309,0.051382,0.029407,0.022297,0.098721,0.032974,0.075335,0.021241,0.037797,0.034262,0.046881,...,0.038940,0.016997,0.069777,0.075766,0.036231,0.068229,0.060590,0.062102,0.030343,0.061808
11310,0.045498,0.021226,0.035215,0.087389,0.018055,0.049514,0.006679,0.030908,0.032490,0.030894,...,0.029142,0.018650,0.057320,0.072220,0.025060,0.051189,0.036202,0.058369,0.027292,0.026080
11311,0.043599,0.016463,0.020893,0.093904,0.018055,0.054786,0.012865,0.029916,0.034104,0.039360,...,0.041040,0.017048,0.068297,0.078398,0.024457,0.062752,0.014725,0.050899,0.020962,0.040419
11312,0.049017,0.033791,0.037875,0.078300,0.031965,0.071797,0.007815,0.038323,0.030658,0.034410,...,0.036514,0.030329,0.046550,0.069400,0.024503,0.069067,0.024432,0.053235,0.035796,0.077120


In [11]:
# predict the closet neighbor for each test using the dot product
tfidf_dot_predict = df_tfidf_dot_predict.idxmax(axis = 0)
tfidf_dot_predict

0        9048
1        4114
2        3172
3       10575
4       10278
        ...  
7527     9483
7528     2850
7529     6368
7530     2965
7531     9513
Length: 7532, dtype: int64

In [12]:
# find the topic using the topic of the closet neighbor according to the dot product
new_df_tfidf = pd.DataFrame(columns=['similarity_text', 'per_group'])

for i in range(len(tfidf_dot_predict.values)):
    row_df = pd.DataFrame({'similarity_text': [tfidf_dot_predict.values[i]],'per_group': [news_group_target_names[news_group_target[tfidf_dot_predict.values[i]]]]})
    new_df_tfidf = new_df_tfidf.append(row_df,ignore_index=True)
new_df_tfidf 

Unnamed: 0,similarity_text,per_group
0,9048,sci.electronics
1,4114,sci.crypt
2,3172,alt.atheism
3,10575,sci.crypt
4,10278,alt.atheism
...,...,...
7527,9483,sci.space
7528,2850,comp.windows.x
7529,6368,rec.sport.hockey
7530,2965,alt.atheism


In [12]:
# find the topic using the topic of the closet neighbor according to the dot product
new_df_tfidf = pd.DataFrame(columns=['similarity_text', 'per_group'])

for i in range(len(tfidf_dot_predict.values)):
    row_df = pd.DataFrame({'similarity_text': [tfidf_dot_predict.values[i]],'per_group': [news_group_target_names[news_group_target[tfidf_dot_predict.values[i]]]]})
    new_df_tfidf = new_df_tfidf.append(row_df,ignore_index=True)
new_df_tfidf 

Unnamed: 0,similarity_text,per_group
0,9048,sci.electronics
1,4114,sci.crypt
2,3172,alt.atheism
3,10575,sci.crypt
4,10278,alt.atheism
...,...,...
7527,9483,sci.space
7528,2850,comp.windows.x
7529,6368,rec.sport.hockey
7530,2965,alt.atheism


# Accuracy


<div style="font-size: 20px" dir='rtl'>
    <b> דיון בתוצאות:    </b>
</div>
<br>
<div dir='rtl'>
    ניתן לשים לב שבשיטת ה-TFIDF קיבלנו תוצאות טובות יותר מאשר בשיטת ה- TF
    כמו כן, תוצאת ה Cosine בכל אחת מהאפשרויות תמיד טובה יותר מה-Dot
</div>
<div dir='rtl'>
    התוצאה הטובה ביותר התקבלה ב-TF-IDF כאשר היא זהה ל-TF Dot product (הסבר למטה למה זה קרה )
</div>

## tf dot product accuracy:

In [11]:
tf_dot_predict = tf_dot_predict.values
tf_dot_predict = [news_group_target[i] for i in tf_dot_predict]
accuracy_score(news_group_test.target, tf_dot_predict)

0.35847052575677113

## tf cos accuracy:

In [45]:
df_tf_cos_predict = df_tf_cos_predict.values
df_tf_cos_predict = [news_group_target[i] for i in df_tf_cos_predict]

accuracy_score(news_group_test.target, df_tf_cos_predict)

0.6383430695698353

## tf-idf cos accuracy:

In [15]:
tfidf_cos_predict = tfidf_cos_predict.values
tfidf_cos_predict = [news_group_target[i] for i in tfidf_cos_predict]

accuracy_score(news_group_test.target, tfidf_cos_predict)

0.6724641529474243

## tf-idf dot product accuracy:

In [13]:
tfidf_dot_predict = tfidf_dot_predict.values
tfidf_dot_predict = [news_group_target[i] for i in tfidf_dot_predict]

accuracy_score(news_group_test.target, tfidf_dot_predict)

0.6724641529474243

<div dir='rtl'>
    
ניתן לראות שב-TFIDF הדיוק של ה-Cosine יצא זהה לדיוק של Dot product .
הרצנו את החישוב על כמות טקסטים קטנה יותר וגילינו שהשוני הוא בספרה מאוד רחוקה אחרי הנקודה ולכן יצא זהה.
    </div>
    <br>
    <div dir='rtl'>
הוספנו פה קוד לדוגמה:
</div>
    <div dir='rtl'>
לקחנו עשרה טקסטים מה-Test ו- Train ופעם אחת הרצנו Cos בפעם השנייה Dot וקיבלנו את אותם ווקטורים
</div>
  <div dir='rtl'>
* הערה חשובה : אנחנו בנינו את הוקטור של Dot ולכן האיבר הראשון של כל מערך נמצא במערך הראשון ב-Cos וכך הלאה.
    לדוגמה : 
    <br>dot[0][0] = cos[0][0] , dot[1][0] = cos[0][1] , dot[2][0] = cos [0][2]
    כלומר בסופו של דבר קיבלנו את אותו DF ואת אותו ייצוג אך ה-Cos בונה את המערך בצורה שונה אך עדיין זהה לשלנו!
</div>

In [23]:
train_vectors = tfidf_train[0:10].toarray()
test_vectors = tfidf_test[0:10].toarray()
vector_list= []
for test_count in range(10):
    vector_list.append(np.dot(train_vectors[0:10],test_vectors[test_count]))
vector_listresult = cosine_similarity(tfidf_train[0:10], tfidf_test[0:10],  dense_output=True)
result

array([[0.04146014, 0.01678133, 0.00341898, 0.0210587 , 0.00877006,
        0.01571061, 0.00172013, 0.01559119, 0.01281503, 0.01503676],
       [0.00905899, 0.01879483, 0.00303131, 0.01918302, 0.0054784 ,
        0.00780862, 0.00118769, 0.00392184, 0.01609298, 0.00286318],
       [0.06248992, 0.03530185, 0.01074101, 0.02378894, 0.00628912,
        0.02195819, 0.03409074, 0.02147107, 0.0544351 , 0.02172251],
       [0.01530909, 0.01718815, 0.01334994, 0.01495707, 0.01674917,
        0.01580445, 0.00594771, 0.00902019, 0.01642479, 0.02360123],
       [0.02380781, 0.01285243, 0.0192736 , 0.01451655, 0.02849627,
        0.02026429, 0.00598442, 0.00435259, 0.01367846, 0.00967355],
       [0.00458201, 0.00713575, 0.00774956, 0.01918331, 0.01502714,
        0.01729416, 0.01461944, 0.00439668, 0.01047133, 0.00176829],
       [0.01866082, 0.00688357, 0.00357296, 0.00629498, 0.00441365,
        0.01104234, 0.0017976 , 0.03882607, 0.00703882, 0.00365262],
       [0.0173333 , 0.00857167, 0.0157320

In [31]:
train_vectors = tfidf_train[0:10].toarray()
test_vectors = tfidf_test[0:10].toarray()
vector_list= []
for test_count in range(10):
    vector_list.append(np.dot(train_vectors[0:10],test_vectors[test_count]))
vector_list

[array([0.04146014, 0.00905899, 0.06248992, 0.01530909, 0.02380781,
        0.00458201, 0.01866082, 0.0173333 , 0.00763645, 0.01985888]),
 array([0.01678133, 0.01879483, 0.03530185, 0.01718815, 0.01285243,
        0.00713575, 0.00688357, 0.00857167, 0.01332965, 0.00768715]),
 array([0.00341898, 0.00303131, 0.01074101, 0.01334994, 0.0192736 ,
        0.00774956, 0.00357296, 0.01573207, 0.00381016, 0.00296694]),
 array([0.0210587 , 0.01918302, 0.02378894, 0.01495707, 0.01451655,
        0.01918331, 0.00629498, 0.01046154, 0.01060185, 0.01143382]),
 array([0.00877006, 0.0054784 , 0.00628912, 0.01674917, 0.02849627,
        0.01502714, 0.00441365, 0.00516953, 0.00124941, 0.00444025]),
 array([0.01571061, 0.00780862, 0.02195819, 0.01580445, 0.02026429,
        0.01729416, 0.01104234, 0.01266261, 0.00819684, 0.01554432]),
 array([0.00172013, 0.00118769, 0.03409074, 0.00594771, 0.00598442,
        0.01461944, 0.0017976 , 0.00042796, 0.00267759, 0.00727682]),
 array([0.01559119, 0.00392184, 0.

# 'סעיף ג

### בחרנו להשתמש ב- TFIDF Cos

In [8]:
group_wrong_dic = {}
group_accuracy_dic = {}

for real,predict in zip(news_group_test.target,tfidf_cos_predict):
    if(real not in group_wrong_dic.keys()):
        group_wrong_dic[real] = list()
    if(real == predict):
        if(real not in group_accuracy_dic.keys()):
            group_accuracy_dic[real] = 0
        group_accuracy_dic[real] += 1
    else:
        group_wrong_dic[real].append(predict)

for key in group_accuracy_dic.keys():
    print("The accuracy of group " + news_group_target_names[key] + "is " + str(group_accuracy_dic[key] / news_group_test.target.tolist().count(key)))
    c = Counter(group_wrong_dic[key])
    group_index = c.most_common(1)[0][0]
    print("The most common mistake for this group is " + news_group_target_names[group_index])
    print("-------------------------------------------------------------------------")

The accuracy of group alt.atheismis 0.7429467084639498
The most common mistake for this group is talk.religion.misc
-------------------------------------------------------------------------
The accuracy of group sci.medis 0.5732323232323232
The most common mistake for this group is talk.politics.misc
-------------------------------------------------------------------------
The accuracy of group soc.religion.christianis 0.7688442211055276
The most common mistake for this group is alt.atheism
-------------------------------------------------------------------------
The accuracy of group comp.windows.xis 0.5822784810126582
The most common mistake for this group is comp.os.ms-windows.misc
-------------------------------------------------------------------------
The accuracy of group comp.graphicsis 0.5372750642673522
The most common mistake for this group is comp.windows.x
-------------------------------------------------------------------------
The accuracy of group talk.politics.mideasti