In [3]:
from urllib.request import urlopen, Request
from lxml import html
import pandas as pd
from datetime import datetime
import re
import concurrent.futures
from bs4 import BeautifulSoup
import requests

In [4]:
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 \
    (KHTML, like Gecko) Chrome/41.0.2228.0 Safari/537.3'}

In [5]:
def page_content_print_time(thread_id):
    %time df_posts, df_users = page_content(thread_id)
    return df_posts, df_users

In [6]:
def thread_ids(pages, verbose=True):
    print('Fetching pages...')
    count = 0
    base_main_url = 'https://www.fxp.co.il/forumdisplay.php?f=46&page=%d'
    thread_ids = []
    for i in pages:
        main_url = base_main_url % i
        response = requests.get(main_url)
        contents = response.content.decode("utf-8")
        response.close()
        currenct_thread_ids = re.findall("showthread\.php\?t=(.*)\" id=", contents)
        thread_ids +=  [int(x) for x in currenct_thread_ids]
        count += 1
        if verbose and (count % 10 == 0):
            print('Fetched %s pages' % (count))
            print(datetime.now().time())
    thread_ids.remove(12069815)
    return list(set(thread_ids))


In [7]:
def page_content(thread_id):
    df_thread_new = pd.DataFrame(columns=['thread_id', 'title'])
    df_post_new = pd.DataFrame(columns=['thread_id', 'post_id','user_name','date','message',
                                        'cite1','cite2','cite3','cite4'])
    df_user_new = pd.DataFrame(columns=['user_name', 'register_date','message_count','signiture_text'])
    page = 1
    url='https://www.fxp.co.il/showthread.php?t=%s' % (thread_id)
    response_page = None
    while page == 1 or not response_page.history:
        if page > 1:
            url='https://www.fxp.co.il/showthread.php?t=%s&page=%s' % (thread_id,str(page))
        try:
            response_page = requests.get(url)
            content_page = response_page.content.decode("utf-8")
        except:
            import time
            time.sleep(60)
            response_page = requests.get(url)
            content_page = response_page.content.decode("utf-8")
        if not response_page.history:
            if page == 1:
                soup = BeautifulSoup(content_page, 'lxml')
                title = title = soup.title.getText()
                df_thread_new = df_thread_new.append({'thread_id':thread_id, 'title':title}, ignore_index=True)
            df_post_new_, df_user_new_ = thread_single_page_content(thread_id, content_page, page)
            df_post_new = pd.concat([df_post_new,df_post_new_])
            df_user_new = pd.concat([df_user_new,df_user_new_])
        page += 1
    return df_thread_new, df_post_new, df_user_new

In [8]:
def thread_single_page_content(thread_id, contents, page):
    df_post_new = pd.DataFrame(columns=['thread_id', 'post_id','user_name','date','message',
                                        'cite1','cite2','cite3','cite4'])
    df_user_new = pd.DataFrame(columns=['user_name', 'register_date','message_count','signiture_text'])
    soup = BeautifulSoup(contents, 'lxml')
    posts = soup.find_all(id=re.compile('^post_[0-9].*'))
    for post in posts:
        post_id = post.get('id').replace('_','')
        try:
            user_name = post.find('a', {'class': re.compile('username .* popupctrl')}).findChildren("span" , recursive=True)[0].getText()
        except:
            user_name = post.find('a', {'class': re.compile('username .* popupctrl')}).findChildren("strong" , recursive=True)[0].getText()
            #post_counter = post.find('a', {'class': 'postcounter'}).getText()
            #print(f'Error in thread: {thread_id} ,page: {page} in {message} {post_counter}')
        message_date = post.find('span', {'class': 'date'}).getText()
        cites = post.find_all('div', {'class': 'bbcode_quote'})
        cited_post = []
        for cite in cites:
            for c in cites:
                try:
                    link = c.find('a')['href']
                    cited_post.append(link[link.find('#')+1:])
                except:
                    cited_post.append('custom cites')
        user_details = post.find('dl', {'class': 'userstats-new'}).find_all('dd')
        user_reg_date = user_details[0].getText()
        user_message_count = user_details[1].getText()
        try:
            signiture_text = post.find('blockquote', {'class': 'signature restore'}).getText()
        except:
            signiture_text = ''
        for cite in cites:
            cite.extract()
        message = post.find('blockquote', {'class': 'postcontent restore'}).getText().replace('\n','.').strip()
        df_post_new = df_post_new.append({'thread_id': thread_id, 'post_id': post_id,'user_name': user_name,'date':message_date,
                            'message': message,'cite1':cited_post[0] if len(cites) > 0 else '',
                            'cite2': cited_post[1] if len(cites) > 1 else '', 
                            'cite3': cited_post[2] if len(cites) > 2 else '',
                            'cite4': cited_post[3] if len(cites) > 3 else ''}, ignore_index=True)
        df_user_new = df_user_new.append({'user_name': user_name, 'register_date': user_reg_date, 'message_count': user_message_count, 'signiture_text': signiture_text}, ignore_index=True)
    return df_post_new, df_user_new

In [9]:
def thread_content(thread_ids, thread_file='thread.csv', post_file='post.csv', user_file='user.csv',verbose=True):
    print('Fetching threads...')
    count = 0
    df_thread_new = pd.DataFrame(columns=['thread_id', 'title','type'])
    df_post_new = pd.DataFrame(columns=['thread_id', 'post_id','user_name','date','message',
                                        'cite1','cite2','cite3','cite4'])
    df_user_new = pd.DataFrame(columns=['user_name', 'register_date','message_count','signiture_text'])
    with concurrent.futures.ThreadPoolExecutor(max_workers=None) as executor:
        future_to_url = {executor.submit(page_content, thread_id): thread_id for thread_id in thread_ids}
        print(f'finish to submit all jobs')
        df = pd.DataFrame(columns=['thread', 'post'])
        for future in concurrent.futures.as_completed(future_to_url):
            df_thread_new_, df_post_new_, df_user_new_=future.result()
            df_thread_new = pd.concat([df_thread_new,df_thread_new_])
            df_post_new = pd.concat([df_post_new,df_post_new_])
            df_user_new = pd.concat([df_user_new,df_user_new_])
    df_thread_new.to_csv(thread_file, encoding='utf-8')        
    df_post_new.to_csv(post_file, encoding='utf-8')        
    df_user_new.to_csv(user_file, encoding='utf-8')

In [10]:
number_thread_pages = 1000

In [9]:
%time thread_content(thread_ids(range(1, number_thread_pages)))

Fetching pages...
Fetched 10 pages
22:54:45.222893
Fetched 20 pages
22:54:53.764558
Fetched 30 pages
22:55:02.627425
Fetched 40 pages
22:55:10.166106
Fetched 50 pages
22:55:18.640440
Fetched 60 pages
22:55:26.946924
Fetched 70 pages
22:55:35.714025
Fetched 80 pages
22:55:43.904599
Fetched 90 pages
22:55:51.524925
Fetched 100 pages
22:55:58.963529
Fetched 110 pages
22:56:06.007015
Fetched 120 pages
22:56:14.176716
Fetched 130 pages
22:56:22.719196
Fetched 140 pages
22:56:30.740535
Fetched 150 pages
22:56:38.644991
Fetched 160 pages
22:56:46.627496
Fetched 170 pages
22:56:54.902062
Fetched 180 pages
22:57:03.385421
Fetched 190 pages
22:57:12.415027
Fetched 200 pages
22:57:20.983434
Fetched 210 pages
22:57:29.198143
Fetched 220 pages
22:57:38.300528
Fetched 230 pages
22:57:46.786348
Fetched 240 pages
22:57:55.851795
Fetched 250 pages
22:58:05.669401
Fetched 260 pages
22:58:13.982748
Fetched 270 pages
22:58:23.012830
Fetched 280 pages
22:58:30.319686
Fetched 290 pages
22:58:38.739219
Fetch

In [10]:
df = pd.read_csv('post.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [11]:
df.shape

(305360, 10)

In [12]:
dup = pd.concat(g for _, g in df.groupby(["thread_id",'date','message']) if len(g) > 1).sort_index()

In [13]:
dup.head(n=30)

Unnamed: 0.1,Unnamed: 0,thread_id,post_id,user_name,date,message,cite1,cite2,cite3,cite4
1264,4,19923700,post199966141,Fleischer,16-10-2019 15:30,...תודה!.,,,,
1265,5,19923700,post199966148,Fleischer,16-10-2019 15:30,...תודה!.,,,,
3798,0,19793901,post197773743,Almog_1998,01-08-2019 19:59,.חחחחח איזה דוח איזה פשוט אל תזדהה לו או שתזדה...,,,,
3799,1,19793901,post197773752,Almog_1998,01-08-2019 19:59,.חחחחח איזה דוח איזה פשוט אל תזדהה לו או שתזדה...,,,,
6987,1,20188785,post203545431,yoav2000,28-03-2020 21:59,.תתבגר.,,,,
6988,2,20188785,post203545436,YeledKappa,28-03-2020 21:59,.תתבגר.,,,,
7098,5,19795721,post197806290,GamerDream,02-08-2019 17:42,...תודה.,,,,
7099,6,19795721,post197806292,GamerDream,02-08-2019 17:42,...תודה.,,,,
8135,10,19927422,post200026859,I.love.music,18-10-2019 17:17,...תודה😇.,,,,
8136,11,19927422,post200026863,I.love.music,18-10-2019 17:17,...תודה😇.,,,,


In [14]:
df.loc[df['thread_id'] == '20279392'].sort_values(['thread_id','date'])

Unnamed: 0.1,Unnamed: 0,thread_id,post_id,user_name,date,message,cite1,cite2,cite3,cite4


In [15]:
dup

Unnamed: 0.1,Unnamed: 0,thread_id,post_id,user_name,date,message,cite1,cite2,cite3,cite4
1264,4,19923700,post199966141,Fleischer,16-10-2019 15:30,...תודה!.,,,,
1265,5,19923700,post199966148,Fleischer,16-10-2019 15:30,...תודה!.,,,,
3798,0,19793901,post197773743,Almog_1998,01-08-2019 19:59,.חחחחח איזה דוח איזה פשוט אל תזדהה לו או שתזדה...,,,,
3799,1,19793901,post197773752,Almog_1998,01-08-2019 19:59,.חחחחח איזה דוח איזה פשוט אל תזדהה לו או שתזדה...,,,,
6987,1,20188785,post203545431,yoav2000,28-03-2020 21:59,.תתבגר.,,,,
...,...,...,...,...,...,...,...,...,...,...
301369,12,2.00519e+07,post202109317,givati1,10-01-2020 14:32,...מוזמן לקבל בהודעה פרטית אבל האשכול ימחק תוך...,,,,
302401,10,2.01831e+07,post203479796,Stack_Over_Flow,26-03-2020 01:26,...תודה רבה.,,,,
302402,11,2.01831e+07,post203479804,Stack_Over_Flow,26-03-2020 01:26,...תודה רבה.,,,,
303572,10,1.97905e+07,post197726604,Image501,31-07-2019 11:40,....Your browser does not support the audio el...,,,,


In [16]:
print((0.5*dup.shape[0])/df.shape[0])

0.00038806654440660205


In [11]:
thread_id_to_check = 20185710

In [12]:
page_content(thread_id_to_check)[0].

(  thread_id                                         title
 0  20185710  חל"ת עד גיל 28 מקבלים \t​% 48 מהמשכורת - FXP,
   thread_id        post_id    user_name              date  \
 0  20185710  post203512185       _nero_  27-03-2020 05:19   
 1  20185710  post203512413  rotem123451  27-03-2020 06:17   
 
                                              message cite1 cite2 cite3 cite4  
 0  .כותרת... השאלה היא כזאת אני ב10 באפריל יהיה ב...                          
 1                    .וואו עצוב בגיל הזה עדיין בכספ.                          ,
      user_name register_date message_count  \
 0       _nero_      23-06-09          nero   
 1  rotem123451      13-03-20           370   
 
                                       signiture_text  
 0  \nMy Pc\n I7-8700k 5.0GHZ,1.340V \npalit gtx 1...  
 1                                                     )

In [19]:
url='https://www.fxp.co.il/showthread.php?t=%s' % (thread_id_to_check)
response_page = requests.get(url)
content_page = response_page.content.decode("utf-8")
soup = BeautifulSoup(content_page, 'lxml')
count = 1
posts = soup.find_all(id=re.compile('^post_[0-9].*'))
for post in posts:
    print(count)
    count += 1
    post_id = post.get('id').replace('_','')
    try:
        user_name = post.find('a', {'class': re.compile('username .* popupctrl')}).findChildren("span" , recursive=True)[0].getText()
    except:
        user_name = post.find('a', {'class': re.compile('username .* popupctrl')}).findChildren("strong" , recursive=True)[0].getText()
        #post_counter = post.find('a', {'class': 'postcounter'}).getText()
        #print(f'Error in thread: {thread_id} ,page: {page} in {message} {post_counter}')
    message_date = post.find('span', {'class': 'date'}).getText()
    cites = post.find_all('div', {'class': 'bbcode_quote'})
    cited_post = []
    for cite in cites:
        for c in cites:
            try:
                link = c.find('a')['href']
                cited_post.append(link[link.find('#')+1:])
            except:
                cited_post.append('custom cites')
    user_details = post.find('dl', {'class': 'userstats-new'}).find_all('dd')
    user_reg_date = user_details[0].getText()
    user_message_count = user_details[1].getText()
    try:
        signiture_text = post.find('blockquote', {'class': 'signature restore'}).getText()
    except:
        signiture_text = ''
    for cite in cites:
        cite.extract()
    message = post.find('blockquote', {'class': 'postcontent restore'}).getText().replace('\n','.').strip()
    print(message)

1
.אני אמור להתחיל קורס נהג משא יש לי קהס 43 תש 4 איך אני יכול לצאת מנהג משא עוד כמה ימים קורס מתחיל בבקשה עזרה יש לי קהס 43 תש 4.
2
.אני בבהד 6 עיר הבדים.
3
.מישהו יודע מה זה חייל מיועד רצף?.
4
...זה חייל שהוא מיועד רצף.דקך אגב מה הכהס והתש שלך.
5
.חחחחחחחחח מה אתה גנוב איזה נהג משא..עם ת"ש 4 וקה"ס 43 אתה צריך לקבל ג'וב של 3 שעות ליד הבית..צא צא לנפקדות כמה שיותר מהר אחי.
6
...מה זה רצף קהס 43 ותש 4.
7
...אבל אני אכנס לכלא לא ? אני מת לצאת מנהג משא.
8
...גם אם תכנס אתה חייב לפסול את החרא הזה, תחכה שהקורס יתחיל ושהם לא יוכלו לדחוף אותך באמצע אליו..הם לא ישאירו אותך למחזור הבא, ואם כן מקסימום עוד כליאה, עד שתיפלט לקצין מיון..כלא זה שטויות אחי תאמין לי שעדיף שתסבול קצת משתסבול כל השירות שלך וידחפו לך עוד מילואים לחרא הזה.
9
...מה פירוש של חייל מיועד רצף לא הבנתי את פירוש של זה .
10
...תלוי כמה אתה לא רוצה להיות נהג משא אל תסכים עד שיוציאו אותך ושלא יגידו לך תסיים את הקורס ואז נדבר פשוט אל תסכים.
11
.אני בנתיים רק הוצאתי טופס ירוק רופא לא חתם עליו עדיין אני ממש לא יודע מה לעשות כאילו טופס

In [20]:
page = 1
url='https://www.fxp.co.il/showthread.php?t=%s' % (thread_id_to_check)
response_page = None
while page == 1 or not response_page.history:
    if page > 1:
        url='https://www.fxp.co.il/showthread.php?t=%s&page=%s' % (thread_id_to_check,str(page))
    response_page = requests.get(url)
    content_page = response_page.content.decode("utf-8")
    if page == 1:
        soup = BeautifulSoup(content_page, 'lxml')
        title = title = soup.title.getText()
    print(thread_single_page_content(thread_id_to_check, content_page, page))
    page += 1

(   thread_id        post_id     user_name              date  \
0   20279392  post204573951       new9214  09-05-2020 14:44   
1   20279392  post204573964       new9214  09-05-2020 14:44   
2   20279392  post204573984       new9214  09-05-2020 14:45   
3   20279392  post204574144  Vaginaterian  09-05-2020 14:52   
4   20279392  post204574198       For4ver  09-05-2020 14:55   
5   20279392  post204574947       new9214  09-05-2020 15:39   
6   20279392  post204574955       new9214  09-05-2020 15:39   
7   20279392  post204574988       For4ver  09-05-2020 15:41   
8   20279392  post204575020       new9214  09-05-2020 15:43   
9   20279392  post204575030    ilantal321  09-05-2020 15:43   
10  20279392  post204575038       new9214  09-05-2020 15:44   
11  20279392  post204575060       For4ver  09-05-2020 15:45   
12  20279392  post204575162    ilantal321  09-05-2020 15:51   
13  20279392  post204575190    ilantal321  09-05-2020 15:52   
14  20279392  post204575246       new9214  09-05-2020 

In [21]:
def send_email(user='dsakaidf@gmail.com', pwd='d54k4idf!', recipient='shkasta@post.bgu.ac.il',
               subject='finish expirement', body='finish the expirement'):
    import smtplib

    gmail_user = user
    gmail_pwd = pwd
    FROM = user
    TO = recipient if type(recipient) is list else [recipient]
    SUBJECT = subject
    TEXT = body

    # Prepare actual message
    message = """From: %s\nTo: %s\nSubject: %s\n\n%s
    """ % (FROM, ", ".join(TO), SUBJECT, TEXT)
    try:
        # SMTP_SSL Example
        server_ssl = smtplib.SMTP_SSL("smtp.gmail.com", 465)
        server_ssl.ehlo()  # optional, called by login()
        server_ssl.login(gmail_user, gmail_pwd)
        # ssl server doesn't support or need tls, so don't call server_ssl.starttls()
        server_ssl.sendmail(FROM, TO, message)
        # server_ssl.quit()
        server_ssl.close()
        print('successfully sent the mail')
    except:
        print("failed to send mail")

In [22]:
send_email()

successfully sent the mail
