In [None]:
import json
import re
import time
from pprint import pprint
# Keep this after the line above: it rebinds `pprint` to the module, which the
# cells below call as `pprint.pprint(...)`.
import pprint

import pymongo
import requests
from pymongo import MongoClient

### connect to MongoDB

In [None]:
# Open a client against the local MongoDB server on its default port (27017).
# A MongoDB installation must be available on this machine.
client = MongoClient(host='localhost', port=27017)

# Select the `sahamyab` database (expected to exist already).
db = client['sahamyab']

# Select the `tweets` collection inside that database (expected to exist already).
series_collection = db['tweets']


### download 1000 tweets from sahamyab and save into MongoDB

In [None]:
start_time = time.time()
# Poll the sahamyab guest feed until the collection holds at least 1000 tweets.
# `Collection.count()` was removed in PyMongo 4; `count_documents({})` is the
# supported way to count every document in the collection.
while series_collection.count_documents({}) < 1000:
    # Fetch the current page of the guest feed.
    response = requests.get(
        'https://www.sahamyab.com/guest/twiter/list?v=0.1',
        headers={'User-Agent': 'chrome/61'},
        timeout=30,  # do not hang forever on a stalled connection
    )
    # Stop on an HTTP error instead of trying to parse an error page as JSON.
    response.raise_for_status()
    data = response.json()
    twiit_10 = data['items']

    # Insert only the tweets that are not already stored in the collection.
    for element in twiit_10:
        if series_collection.find_one(element) is None:
            result = series_collection.insert_one(element)

    # Skip the wait once the target is reached.
    if series_collection.count_documents({}) >= 1000:
        break
    # Otherwise wait until the start of the next wall-clock minute.
    time.sleep(60 - time.time() % 60)

end_time = time.time()
delta_time = end_time - start_time


### The replacement of arabic characters with Persian characters

replacing 'ي' with 'ی' and 'ك' with 'ک'

In [None]:
start_time = time.time()
# Translation table for the two substitutions: 'ي' -> 'ی' and 'ك' -> 'ک'.
arabic_to_persian = str.maketrans({'ي': 'ی', 'ك': 'ک'})

# Only documents whose content contains one of the two Arabic letters are
# fetched, and only the `content` field (plus `_id`) is returned.
for document in series_collection.find(
        {'content': {'$regex': '.*ك.*|.*ي.*'}}, {'content': 1}):
    fixed_content = document['content'].translate(arabic_to_persian)
    # `Collection.update()` was removed in PyMongo 4. Address the document by
    # its `_id` and change only the `content` field.
    series_collection.update_one(
        {'_id': document['_id']},
        {'$set': {'content': fixed_content}})

end_time = time.time()
delta_time = end_time - start_time
print(delta_time)

### Hashtags are extracted from the field named content

In [None]:
start_time = time.time()
# Visit every tweet whose content contains a '#'; only `content` (plus `_id`)
# is needed to extract the hashtags.
for document in series_collection.find(
        {'content': {'$regex': '.*#.*'}}, {'content': 1}):
    # The capture group returns the word after each '#', so the sign itself is
    # dropped. The raw string avoids the invalid-escape warning for `\w`.
    hashtags = re.findall(r'#(\w+)', document['content'])
    # `Collection.update()` was removed in PyMongo 4. Address the document by
    # its `_id` and set only the `hashtags` field.
    series_collection.update_one(
        {'_id': document['_id']},
        {'$set': {'hashtags': hashtags}})

end_time = time.time()
delta_time = end_time - start_time
print(delta_time)

### Identify users whose mediaContentType is image/jpeg and whose parentId has value

#### without using mongoDB index

In [None]:
start_time = time.time()
# `find()` only builds a lazy cursor; the query is sent when the cursor is
# first iterated. list() drains it here so the timing covers the query itself.
# Listing both conditions in one filter document is an implicit AND.
res = list(series_collection.find(
    {'mediaContentType': 'image/jpeg', 'parentId': {'$exists': True}},
    {"senderName": 1}))

end_time = time.time()
delta_time = end_time - start_time
print(delta_time)

# Collect the sender names and show each matching document.
lis = []
for i in res:
    lis.append(i['senderName'])
    pprint.pprint(i)

#### using mongoDB index

In [None]:
### Create a compound index on the two fields used by the filter
series_collection.create_index(
    [("mediaContentType", pymongo.DESCENDING), ("parentId", pymongo.DESCENDING)])

start_time = time.time()
# `find()` only builds a lazy cursor; the query is sent when the cursor is
# first iterated. list() drains it here so the timing covers the query itself.
# Listing both conditions in one filter document is an implicit AND.
res = list(series_collection.find(
    {'mediaContentType': 'image/jpeg', 'parentId': {'$exists': True}},
    {"senderName": 1, '_id': 0}))

end_time = time.time()
delta_time = end_time - start_time
print('run time:', delta_time)

# Collect the sender names and show each matching document.
lis = []
for i in res:
    lis.append(i['senderName'])
    pprint.pprint(i)

### Add a new field named gov to the tweets containing any of the hashtags 'شبندر', 'شستا' or 'فولاد'

#### without using mongoDB index

In [None]:
start_time = time.time()

# Select tweets whose `hashtags` array contains any of the three symbols ...
gov_filter = {'hashtags': {'$in': ['فولاد', 'شستا', 'شبندر']}}
# ... and mark each of them with gov = True.
gov_flag = {'$set': {'gov': True}}
series_collection.update_many(gov_filter, gov_flag)

end_time = time.time()
delta_time = end_time - start_time
print(delta_time)

#### using mongoDB index

In [None]:
### Create a descending index on the `hashtags` field
hashtags_index = [("hashtags", pymongo.DESCENDING)]
series_collection.create_index(hashtags_index)

start_time = time.time()

# Select tweets whose `hashtags` array contains any of the three symbols ...
gov_filter = {'hashtags': {'$in': ['فولاد', 'شستا', 'شبندر']}}
# ... and mark each of them with gov = True.
gov_flag = {'$set': {'gov': True}}
series_collection.update_many(gov_filter, gov_flag)

end_time = time.time()
delta_time = end_time - start_time
print('run time:', delta_time)

### Find the senderName and senderProfileImage of users who tweet between 13:00 and 14:00

In [None]:
# Start the clock in this cell; otherwise the reported time is measured from
# whichever earlier cell last set `start_time`.
start_time = time.time()
# Match tweets whose `sendTimePersian` string contains " 13:".
# list() drains the lazy cursor so the timing covers the query itself.
res = list(series_collection.find(
    {'sendTimePersian': {'$regex': '.* 13:.*'}},
    {'senderName': 1, 'senderProfileImage': 1, '_id': 0}))

end_time = time.time()
delta_time = end_time - start_time
print(delta_time)

# Collect [senderName, senderProfileImage] pairs and show each document.
twitt_time_intvrl = []
for i in res:
    twitt_time_intvrl.append([i['senderName'],
                              i['senderProfileImage']])
    pprint.pprint(i)

### find number of users in each group:

1) users with one tweet

2)  users with two or three tweets

3) users with more than three tweets

#### without using mongoDB index

In [None]:
# Start the clock in this cell; otherwise the reported time is measured from
# whichever earlier cell last set `start_time`.
start_time = time.time()

# Stages shared by all three pipelines:
# count the tweets of each sender ...
count_tweet_of_each_ID = {'$group': {'_id': "$senderUsername",
                                     'twitt_number': {'$sum': 1}}}
# ... and keep only that count in the output documents.
project = {'$project': {"twitt_number": 1, '_id': 0}}

#### users with exactly one tweet
match_1 = {'$match': {"twitt_number": {'$eq': 1}}}
res_one_tweet = series_collection.aggregate(
    [count_tweet_of_each_ID, match_1, project])

#### users with two or three tweets
match_2_3 = {'$match': {"twitt_number": {'$gte': 2, '$lte': 3}}}
res_2_3_tweets = series_collection.aggregate(
    [count_tweet_of_each_ID, match_2_3, project])

#### users with more than three tweets
match_gtr_4 = {'$match': {"twitt_number": {'$gt': 3}}}
res_more_than_3 = series_collection.aggregate(
    [count_tweet_of_each_ID, match_gtr_4, project])

end_time = time.time()
delta_time = end_time - start_time
print('run time :', delta_time)
# Each result holds one document per user, so its length is the group size.
print(f'Number of one-tweet-user: {len(list(res_one_tweet))}')
print(f'Number of two and three-tweet-user: {len(list(res_2_3_tweets))}')
print(f'Number of more than three-tweets-user: {len(list(res_more_than_3))}')

#### using mongodb index

In [None]:
### Create a descending index on `senderUsername` before timing the aggregations
series_collection.create_index([("senderUsername", pymongo.DESCENDING)])

start_time = time.time()

#### users with exactly one tweet
count_tweet_of_each_ID = {
    '$group': {'_id': "$senderUsername", 'twitt_number': {'$sum': 1}}}
match_gtr_4 = {'$match': {"twitt_number": {'$eq': 1}}}
project = {'$project': {"twitt_number": 1, '_id': 0}}
one_tweet_pipeline = [count_tweet_of_each_ID, match_gtr_4, project]
res_one_tweet = series_collection.aggregate(one_tweet_pipeline)

#### users with two or three tweets (count strictly between 1 and 4)
count_tweet_of_each_ID = {
    '$group': {'_id': "$senderUsername", 'twitt_number': {'$sum': 1}}}
match_gtr_4 = {'$match': {"twitt_number": {'$gt': 1, '$lt': 4}}}
project = {'$project': {"twitt_number": 1, '_id': 0}}
two_three_pipeline = [count_tweet_of_each_ID, match_gtr_4, project]
res_2_3_tweets = series_collection.aggregate(two_three_pipeline)

#### users with more than three tweets (count of at least 4)
count_tweet_of_each_ID = {
    '$group': {'_id': "$senderUsername", 'twitt_number': {'$sum': 1}}}
match_gtr_4 = {'$match': {"twitt_number": {'$gte': 4}}}
project = {'$project': {"twitt_number": 1, '_id': 0}}
more_than_three_pipeline = [count_tweet_of_each_ID, match_gtr_4, project]
res_more_than_3 = series_collection.aggregate(more_than_three_pipeline)

end_time = time.time()
delta_time = end_time - start_time
print('run time:', delta_time)
# Each result holds one document per user, so its length is the group size.
print(f'Number of users with one tweet: {len(list(res_one_tweet))}')
print(f'Number of users with two and three tweets: {len(list(res_2_3_tweets))}')
print(f'Number of users with more than three tweets: {len(list(res_more_than_3))}')

### Count the number of tweets for each hashtag

In [None]:
start_time = time.time()

# Pipeline: one document per hashtag occurrence -> count per hashtag ->
# most frequent first.
unwind_hashtag_arr = {'$unwind': "$hashtags"}
group_by = {'$group': {'_id': "$hashtags", 'twitt_number': {'$sum': 1}}}
sort = {'$sort': {'twitt_number': -1}}
hashtag_count_pipeline = [unwind_hashtag_arr, group_by, sort]

res_one_tweet = series_collection.aggregate(hashtag_count_pipeline)

### stop the clock
end_time = time.time()
delta_time = end_time - start_time
print('Run time', delta_time)

## print every hashtag together with its tweet count
for hashtag_count in res_one_tweet:
    pprint.pprint(hashtag_count)

### For tweets with parentId, remove the field named type

In [None]:
start_time = time.time()
# Remove the `type` field from every tweet that has a `parentId` field.
# `{'parentId': True}` would only match documents whose parentId is literally
# the boolean true; `$exists` matches any document that carries the field.
series_collection.update_many(
    {'parentId': {'$exists': True}},
    {'$unset': {'type': ''}})

end_time = time.time()
delta_time = end_time - start_time
print('run time:', delta_time)

### What are the most and least repetitive hashtags?

In [None]:
start_time = time.time()

# Stages shared by both pipelines: one document per hashtag occurrence,
# then a tweet count per hashtag.
unwind_hashtag_arr = {'$unwind': "$hashtags"}
group_by = {'$group': {'_id': "$hashtags", 'twitt_number': {'$sum': 1}}}
# Sort direction decides which end of the ranking the single result comes from.
sort_most = {'$sort': {'twitt_number': -1}}
sort_least = {'$sort': {'twitt_number': 1}}
# Keep only the first document after sorting.
lim = {'$limit': 1}

most_pipeline = [unwind_hashtag_arr, group_by, sort_most, lim]
most_Repetitive = series_collection.aggregate(most_pipeline)

least_pipeline = [unwind_hashtag_arr, group_by, sort_least, lim]
least_Repetitive = series_collection.aggregate(least_pipeline)

### stop the clock
end_time = time.time()
delta_time = end_time - start_time
print('run time: ', delta_time)
print(f'most Repetitive hashtag: {list(most_Repetitive)}')
print(' -------------------------------------------------------- ')
print(f'least Repetitive hashtag: {list(least_Repetitive)}')


### Each day, find out the most popular hashtags

#### without using mongoDB index

In [None]:
start_time = time.time()

#### ten most frequent hashtags among the tweets of one day
# Keep tweets whose `sendTimePersian` string contains "/19 "
# (presumably the day-of-month part of the date; verify against the data).
filter_time = {'$match': {'sendTimePersian': {'$regex': '.*/19 .*'}}}
# One output document per element of the `hashtags` array.
unwind_hashtag_arr = {'$unwind': "$hashtags"}
# Count the tweets of each hashtag.
count_tweets = {
    '$group': {'_id': "$hashtags", 'twitt_number': {'$sum': 1}}}
# Most frequent first.
sort = {'$sort': {'twitt_number': -1}}
# Keep the top ten.
lim = {'$limit': 10}

top_hashtags_pipeline = [filter_time, unwind_hashtag_arr, count_tweets, sort, lim]
ten_most_relevant = series_collection.aggregate(top_hashtags_pipeline)

end_time = time.time()
delta_time = end_time - start_time
print('run time:', delta_time)
for hashtag_count in ten_most_relevant:
    pprint.pprint(hashtag_count)

#### using mongoDB index

In [None]:
### Create a compound descending index on `sendTimePersian` and `hashtags`
day_hashtag_index = [("sendTimePersian", pymongo.DESCENDING),
                     ("hashtags", pymongo.DESCENDING)]
series_collection.create_index(day_hashtag_index)

start_time = time.time()

#### ten most frequent hashtags among the tweets of one day
# Keep tweets whose `sendTimePersian` string contains "/14 ".
# NOTE(review): the non-indexed variant of this query in this notebook filters
# on "/19 ", so the two timings are not for the same day; confirm which day
# is intended.
filter_time = {'$match': {'sendTimePersian': {'$regex': '.*/14 .*'}}}
# One output document per element of the `hashtags` array.
unwind_hashtag_arr = {'$unwind': "$hashtags"}
# Count the tweets of each hashtag.
count_tweets = {
    '$group': {'_id': "$hashtags", 'twitt_number': {'$sum': 1}}}
# Most frequent first.
sort = {'$sort': {'twitt_number': -1}}
# Keep the top ten.
lim = {'$limit': 10}

top_hashtags_pipeline = [filter_time, unwind_hashtag_arr, count_tweets, sort, lim]
ten_most_relevant = series_collection.aggregate(top_hashtags_pipeline)

end_time = time.time()
delta_time = end_time - start_time
print('run time:', delta_time)
for hashtag_count in ten_most_relevant:
    pprint.pprint(hashtag_count)

### which user is the most active? also include how many tweets he/she sent.

#### without using mongoDB index

In [None]:
start_time = time.time()
# Keep tweets whose `sendTimePersian` string contains "/19 "
# (presumably the day-of-month part of the date; verify against the data).
filter_time = {'$match': {'sendTimePersian': {'$regex': '.*/19 .*'}}}
# Group by sender and count each sender's tweets.
count_tweet_of_each_ID = {'$group': {
          '_id': "$senderUsername",
          'twitt_number': {'$sum': 1}}}
# Highest tweet count first.
sort = {'$sort': {'twitt_number': -1}}
# Keep only the single most active sender.
lim = {'$limit': 1}
# The former `project` stage was removed: it was never added to the pipeline,
# and `sendTimePersian` is not among the fields produced by the $group stage.

res_one_tweet = series_collection.aggregate(
     [filter_time, count_tweet_of_each_ID, sort, lim])

end_time = time.time()
delta_time = end_time - start_time
print('run time:', delta_time)
# The single result document carries the username (`_id`) and the tweet count.
for i in res_one_tweet:
    pprint.pprint(i)

#### using mongoDB index

In [None]:
# Create a descending index on `sendTimePersian` before timing the aggregation.
series_collection.create_index([("sendTimePersian", pymongo.DESCENDING)])

start_time = time.time()

# Keep tweets whose `sendTimePersian` string contains "/14 ".
filter_time = {'$match': {'sendTimePersian': {'$regex': '.*/14 .*'}}}
# Group by sender and count each sender's tweets.
count_tweet_of_each_ID = {
    '$group': {'_id': "$senderUsername", 'twitt_number': {'$sum': 1}}}
# Highest tweet count first.
sort = {'$sort': {'twitt_number': -1}}
# Keep only the single most active sender.
lim = {'$limit': 1}

most_active_pipeline = [filter_time, count_tweet_of_each_ID, sort, lim]
res_one_tweet = series_collection.aggregate(most_active_pipeline)

end_time = time.time()
delta_time = end_time - start_time
print('run time:', delta_time)
# The single result document carries the username (`_id`) and the tweet count.
for most_active in res_one_tweet:
    pprint.pprint(most_active)
