# regular expressions, and text processing. 

## 1. Regular Expressions 


**regular expression that extracts the urls out of this string.** 

In [30]:
text = """To learn about pros/cons of data science, go to http://datascience.net.\
Alternatively, go to datascience.net/2020/ """
print(text)

To learn about pros/cons of data science, go to http://datascience.net.Alternatively, go to datascience.net/2020/ 


In [31]:
import re
url_reg=r'[(?:https?|ftp)+:\/\/]*[\w/./-]*\.[\/a-z\d\-]+'
match_url=re.findall(url_reg,text)
if len(match_url)==0:
    print("no url has found")
else :
    for url in match_url:
        print(url)
    

http://datascience.net
datascience.net/2020/


**regular expression that extracts all phone numbers and fax numbers from text.**

In [32]:
text = """You can reach me at 054-434-4321, or my office at (03) 502 9571 or (050) 223 957.\ 
Send me a fax at 03 502 7422. We finally made the sale for all 977 giraffes.\
They wanted 225 957 dollars for it."""

In [33]:
num_reg=r'[()\d]+[\-\s]\d+[\-\s]\d+|\d{10}|\d{9}'
match_nums=re.findall(num_reg,text)
if len(match_nums)==0:
    print("no number has found")
else :
    for num in match_nums:
        print(num)

054-434-4321
(03) 502 9571
(050) 223 957
03 502 7422


**regular expression that extracts all opening html tags.** 

In [34]:
html = "This is <b>important</b> and <u>very</u><i>timely</i><br />. Was this <span> what you meant?</span>"

In [35]:
tag_reg=r'<\w*\W*>'
match_tags=re.findall(tag_reg,html)
if len(match_nums)==0:
    print("no tags has found")
else :
    for tag in match_tags:
        print(tag)

<b>
<u>
<i>
<br />
<span>


**regular expression that extracts all the names of people**

In [36]:
text = """Arnold Schwarzenegger was born in Austria. He and Sylvester Stalone used to run a restaurant\
with J. Edgar Hoover."""

In [37]:
name_reg=r'(?:[A-Z]\.?\w*\s[A-Z]\w*)(?:\s[A-Z]\w*)?'
match_name=re.findall(name_reg,text)
if len(match_name)==0:
    print("no tags has found")
else :
    for name in match_name:
        print(name)

Arnold Schwarzenegger
Sylvester Stalone
J. Edgar Hoover


**regular expression that extracts the text out of all html elements of class important.**

In [38]:
text = """Lorem ipsum dolor <b>sit</b> amet, <b class="important">consectetur adipiscing</b> elit,\ 
sed do eiusmod <span id="note">tempor incididunt ut</span> <div>labore <strong class="important">\
et dolore magna</strong> aliqua.</div> Ut enim ad minim veniam, quis nostrud exercitation ullamco."""
print(text)

Lorem ipsum dolor <b>sit</b> amet, <b class="important">consectetur adipiscing</b> elit,\ 
sed do eiusmod <span id="note">tempor incididunt ut</span> <div>labore <strong class="important">et dolore magna</strong> aliqua.</div> Ut enim ad minim veniam, quis nostrud exercitation ullamco.


In [39]:
htmltext_reg=r'<\w*\sclass="important">(.+?)<\/\w*>'
match_text=re.findall(htmltext_reg,text)
if len(match_text)==0:
    print("no tags has found")
else :
    for text in match_text:
        print(text)

consectetur adipiscing
et dolore magna


## 2. Text processing: Comparing Netanyahu's and Gantz's Facebook Posts

steps we follow are:
1. Load the data set (you can use read_excel or other pandas functions)   
+ Do preprocessing: (i.e punctuation removal, removal of english phrases, tokenization)
+ Analyze word frequencies per candidate
+ Find main differences between them (try to ignore function words)
+ Try to look for other features (i.e text len, use of emoji's, etc), and look for differences
+ Print a Wordcloud for each candidate with main words
+ Evaluate the results, and write your insights



In [2]:
#first we need to import all useful lib
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk import FreqDist
from nltk.text import Text

**Load the dataset and Print the exact number of posts per candidate in the corpus.**

In [3]:
#reading the excel data into pandas df
df=pd.read_excel('election_posts.xlsx')

#extrect the posts freq for each candidate using nltk
posts_amount=FreqDist(df['PageName'])#save frequency of posts into varible
df_postamount=pd.DataFrame(list(posts_amount.items()), columns = ["Candidate","Amount of posts"])#disply

df_postamount

Unnamed: 0,Candidate,Amount of posts
0,Benjamin Netanyahu,186
1,Benny Gantz,168


**Preprocess the text data. In particular remove punctuations, remove english words, and tokenize the posts into token vectors (one vector for each post)**

In [42]:
tkn_Reg = RegexpTokenizer(r'[א-ת]+')#regex to read only 
token_lst=[]#list for all token vectors

def reg_token(post):#fun that will get a post and tokenize it
    tmp=re.sub(r'[\’\"\״]', "",post)#clean all typs of quote marks from the text
    clean_post=tkn_Reg.tokenize(tmp)#tokenize the post 
    return clean_post#return the post clean and tokenize

for index,row in df.iterrows():#loop over dataframe
    tkn_Vec=(row.PageName,reg_token(row.Data))#create a vector(tuple) from candidate name and post
    token_lst.append(tkn_Vec)#append each vector to list


**Analyze the total word frequencies, and frequencies of words per candidate( also from a relative perspective).** 

In [45]:
total_words=[]#save all post words here
gantz_words=[]#save all ganz words here
bibi_words=[]#save all bibi words here

for i in token_lst:#iterate on all vectors to extract
    for word in i[1]:#iterate over token element on each tuple
        total_words.append(word)#add words to var
        if i[0]=='Benjamin Netanyahu':#check if its bibi post
            bibi_words.append(word)#if yes append it to bibi var
        else:#else do the same for ganz    
            gantz_words.append(word)

#find the frequancy of words 
total_freq=FreqDist(total_words)#total words freq
gantz_freq=FreqDist(gantz_words)#ganz words freq
bibi_freq=FreqDist(bibi_words)#bibi words freq

bibi_anlz_df=pd.DataFrame(list(bibi_freq.items()), columns = ["Word","Frequency"])#transfer into df
gantz_anlz_df=pd.DataFrame(list(gantz_freq.items()), columns = ["Word","Frequency"])#transfer into df

#create a col with the relative freq of each word
gantz_anlz_df['Relative frequency(%)']=(gantz_anlz_df['Frequency']/len(gantz_anlz_df))*100
bibi_anlz_df['Relative frequency(%)']=(bibi_anlz_df['Frequency']/len(bibi_anlz_df))*100
#sort the words, so the most useful words will be on top
gantz_anlz_df = gantz_anlz_df.sort_values(by=['Relative frequency(%)'], ascending=False)
bibi_anlz_df = bibi_anlz_df.sort_values(by=['Relative frequency(%)'], ascending=False)

print(bibi_anlz_df[:20])
print()
print(gantz_anlz_df[:20])

       Word  Frequency  Relative frequency(%)
19       את        210               5.857741
33       של        131               3.654114
15       על        129               3.598326
9        לא        100               2.789400
5     ישראל         62               1.729428
134     אני         57               1.589958
101      עם         53               1.478382
95      גנץ         53               1.478382
48       זה         51               1.422594
127      כל         49               1.366806
330  הממשלה         48               1.338912
10   נתניהו         47               1.311018
183     ראש         43               1.199442
288   ממשלת         37               1.032078
340     כדי         36               1.004184
29   הליכוד         33               0.920502
42       אם         33               0.920502
14      הוא         32               0.892608
192      רק         30               0.836820
41   להצביע         26               0.725244

       Word  Frequency  Relative 

**find the main differences between the language and posts that each candidate uses**

In [15]:
bibi_20=bibi_anlz_df[:20]
gantz_20=gantz_anlz_df[:20]
print(bibi_20)
print(gantz_20)
#bibi_t.collocations()
#gantz_t.concordance()

       Word  Frequency  Relative frequency(%)
19       את        210               5.857741
33       של        131               3.654114
15       על        129               3.598326
9        לא        100               2.789400
5     ישראל         62               1.729428
134     אני         57               1.589958
101      עם         53               1.478382
95      גנץ         53               1.478382
48       זה         51               1.422594
127      כל         49               1.366806
330  הממשלה         48               1.338912
10   נתניהו         47               1.311018
183     ראש         43               1.199442
288   ממשלת         37               1.032078
340     כדי         36               1.004184
29   הליכוד         33               0.920502
42       אם         33               0.920502
14      הוא         32               0.892608
192      רק         30               0.836820
41   להצביע         26               0.725244
       Word  Frequency  Relative f

**Interpretation:** as we see from prev stats , bibi have more post and less words comparing to gantz its posts are shorter.
words analyze:
gantz talk about 'israel' 3.7% and bibi 1.7%
bibi remaind gantz 1.47% and gantz remind bibi 2.37%
gantz talk about himself more than bibi 2.03 compare 1.53 to the word 'אני
bibi talk about votes and his faction more than gantz.


 **interesting features that can show differences between the candidates (features such as post length, emoji's)**

In [28]:
bibi_len=[]
gantz_len=[]
#avg post len per candidate
for i,row in df.iterrows():
    if row.PageName=='Benjamin Netanyahu':
        bibi_len.append(len(row.Data))
    else:
        gantz_len.append(len(row.Data))
#average cacl        
def Average(lst): 
    return sum(lst) / len(lst) 

bibi_average = Average(bibi_len)
gantz_average=Average(gantz_len)
print("Average len of bibi posts =", round(bibi_average, 2)) 
print("Average len of gantz posts =", round(gantz_average, 2))
print()

re_spcl = re.compile(r'[\u263a-\U0001f645\!\?]')#regex for emojis
bibi_spcl=[]
gantz_spcl=[]

#iterate over df to create emoji list for each candidate
for index,row in df.iterrows():#loop over dataframe
    if row.PageName=='Benjamin Netanyahu':
        bibi_spcl.append(re_spcl.findall(row.Data))
    else:
        gantz_spcl.append(re_spcl.findall(row.Data))

#extend the lists
bibi_res=[]
[bibi_res.extend(el) for el in bibi_spcl]
gantz_res=[]
[gantz_res.extend(el) for el in gantz_spcl]
#check freq of emojis
bibimoji_freq=FreqDist(bibi_res)
gantzmoji_freq=FreqDist(gantz_res)

bibi_moji_df=pd.DataFrame(list(bibimoji_freq.items()), columns = ["Emoji","Frequency"])#transfer into df
gantz_moji_df=pd.DataFrame(list(gantzmoji_freq.items()), columns = ["Emoji","Frequency"])
print(bibi_moji_df)
print(gantz_moji_df)

Average len of bibi posts = 260.11
Average len of gantz posts = 488.71

  Emoji  Frequency
0     !         82
1     ❤          4
2     ️         10
3     ?         27
4     🇮         15
5     🇱         15
6     👇          3
7     🇺          1
8     🇸          1
9     ♥          6
  Emoji  Frequency
0     !         31
1     ?         19
2     ￼          1


**Interpretation:** 
we analyzed 3 things:
<br>
1)according to last stats bibi have 186 post and gantz 168 post.
<br>
2)as we analyze avarege of posts length's bibi post are almost twice shorter than gantz posts.
<br>
3)after analyzing emoji's and !\? , we see that bibi use much more emojis than gantz

**wordcloud for each candidate**

In [None]:
from wordcloud import WordCloud, STOPWORDS
from PIL import Image
import numpy as np
#import urllib
import requests
import matplotlib
import matplotlib.pyplot as plt
from bidi.algorithm import get_display

stopwords = set(STOPWORDS)
stopwords.add("said")

#fixing and prepering text for word cloud
textbibi = get_display(str(' '.join(bibi_words)))
textgantz= get_display(str(' '.join(gantz_words)))

#reading masks for web
gantz_mask=np.array(Image.open(requests.get('https://cdn.imgbin.com/21/19/17/imgbin-jerusalem-flag-of-israel-magen-david-adom-judaism-fcexYfA8iT3EMgTg0UBWxmqqM.jpg', stream=True).raw))
bibi_mask = np.array(Image.open(requests.get('http://clipart-library.com/images_k/star-of-david-silhouette/star-of-david-silhouette-7.png', stream=True).raw))

#funcation that generate wordcloud 
def gen_wordcloud(words, mask):
    word_cloud = WordCloud(font_path = 'C:\Windows\Fonts\courbd.ttf',max_font_size=300,background_color="white", max_words=3000, mask=mask, stopwords=stopwords).generate_from_text(words)
    plt.figure(figsize=(9,7))
    plt.imshow(word_cloud,cmap=plt.cm.gray,interpolation='bilinear')
    plt.axis('off')
    plt.show()
    
gen_wordcloud(textbibi,bibi_mask)  
gen_wordcloud(textgantz,gantz_mask) 

**Interpretation:** to sum up , gantz talk more about israel and bibi with longer posts compare to bibi and about what his going to do diffrent compare to bibi
bibi talk more about votes and also about himself (1.37% to the word "נתניהו") with shorter posts (his talk more 'תכלס')