## group memebers: Nafiseh Mohammadi, Ali Saghi

In [1]:
# Importing very basic libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()
import json

# 1) Reading the Tweets in Python

In [2]:
# Reading the JSON lines in truncated-untruncated format
data = []
with open("intro-to-nlp/english-tweets-sample.jsonl","rt",encoding="utf-8") as f:
    for line in f:
        data.append(json.loads(line))
    
print("Data length:",len(data))
print("Data type:",type(data))
print("First item type:",type(data[0]))
print("First item:",data[0])

Data length: 10000
Data type: <class 'list'>
First item type: <class 'dict'>
First item: {'created_at': 'Tue Dec 26 14:16:22 +0000 2017', 'id': 945659557480611840, 'id_str': '945659557480611840', 'text': 'Check out my class in #GranblueFantasy! https://t.co/pAvXn8diJr', 'display_text_range': [0, 39], 'source': '<a href="http://granbluefantasy.jp/" rel="nofollow">グランブルー ファンタジー</a>', 'truncated': False, 'in_reply_to_status_id': None, 'in_reply_to_status_id_str': None, 'in_reply_to_user_id': None, 'in_reply_to_user_id_str': None, 'in_reply_to_screen_name': None, 'user': {'id': 883980236655779840, 'id_str': '883980236655779840', 'name': 'Pc Kwok', 'screen_name': 'jensenpck', 'location': None, 'url': None, 'description': None, 'translator_type': 'none', 'protected': False, 'verified': False, 'followers_count': 0, 'friends_count': 1, 'listed_count': 0, 'favourites_count': 0, 'statuses_count': 42, 'created_at': 'Sun Jul 09 09:24:46 +0000 2017', 'utc_offset': None, 'time_zone': None, 'geo_enab

In [3]:
# Different fields inside each JSON file
print(data[0].keys())

dict_keys(['created_at', 'id', 'id_str', 'text', 'display_text_range', 'source', 'truncated', 'in_reply_to_status_id', 'in_reply_to_status_id_str', 'in_reply_to_user_id', 'in_reply_to_user_id_str', 'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place', 'contributors', 'is_quote_status', 'quote_count', 'reply_count', 'retweet_count', 'favorite_count', 'entities', 'extended_entities', 'favorited', 'retweeted', 'possibly_sensitive', 'filter_level', 'lang', 'timestamp_ms'])


# 2) Extract the actual Text fields from the tweet JSONs

In [4]:
# Reading the full-text of JSON lines when it is in truncated format
truncated_data = []
for item in data:
    if item['truncated'] == True:
        truncated_data.append(item['extended_tweet']['full_text'].lower().split('https:')[0])
    else:
        truncated_data.append(item['text'].lower().split('https:')[0])
    
print("Truncated_data length:",len(truncated_data))
print("Truncated_data type:",type(truncated_data))
print("First item type:",type(truncated_data[5463]))
print("First item:",truncated_data[5463])

Truncated_data length: 10000
Truncated_data type: <class 'list'>
First item type: <class 'str'>
First item: rt @oeasywayout: s/o to @mrmakeitcold for the trade💪💯 


In [5]:
# Create a data frame from the list of texts
df_text = pd.DataFrame(truncated_data,columns=["Text"])
df_text

Unnamed: 0,Text
0,check out my class in #granbluefantasy!
1,extending a big thank you to our community par...
2,blueberry 🍨
3,rt @liluzivert: bad day ☹️®️
4,@prologve_ @bts_army @bts_twt i'm chim tho
...,...
9995,rt @chicodustyy: y'all try to cancel cardi eve...
9996,"#russiagatecloses will they flee? or, stand t..."
9997,i am really interested in the small ring of pi...
9998,rt @mahraalhosani: no one will ever care like ...


In [6]:
# Text segmentation with Naive method 1: Split from whitespace characters
text = df_text.iloc[0].to_string()
tokenized_text = text.split() # split(): Return a list of the words in the string, using whitespace as the delimiter string.
for w in tokenized_text:
    print(w)

Text
check
out
my
class
in
#granbluefantasy!


In [7]:
# Text segmentation with Naive method 2: Split from whitespace characters, take into account punctuation
import re

text = df_text.iloc[0].to_string()
tokenized = re.sub(r'([.,!?]+)',r' \1',text) # replace . , ! ? with whitespace+character(s), '+' means one or more
tokenized = re.sub(r"(n't)",r" \1",tokenized) # clitics
print(tokenized) # Note: this is still string, apply simple whitespace splitting to get a list of tokens

Text    check out my class in #granbluefantasy ! 


# 3) Segment each Tweet

In [8]:
#!pip3 install ufal.udpipe
import ufal.udpipe as udpipe

model = udpipe.Model.load("intro-to-nlp/en.segmenter.udpipe")
# horizontal: returns one sentence per line, with words separated by a single space
pipeline = udpipe.Pipeline(model,"tokenize","none","none","horizontal") 

segmented_document = pipeline.process(truncated_data[-1])

print(segmented_document)

rt @ diosaslesbianas : girl beautiful tits sex 96 @ adultbrazil @ boogie_1969
@sexx_freak @leono77
@ vdsxx1 @ kaifel30 @ 1688aw @europstars
@ swo 22 …



# 4) Count a Word Frequency List

In [9]:
from collections import Counter

token_counter = Counter()
for text in truncated_data: # Tweets
    tokenized = pipeline.process(text)
    tokens = tokenized.split() # after segmenter, we can do whitespace splitting
    token_counter.update(tokens)

print("Most common tokens:",token_counter.most_common(20))
print("Vocabulary size:",len(token_counter))

Most common tokens: [('@', 7482), (':', 6620), ('rt', 5795), ('.', 3919), ('the', 3654), (',', 3028), ('to', 2884), ('#', 2705), ('i', 2612), ('a', 2577), ('you', 2238), ('and', 2160), ('…', 1740), ('of', 1678), ('is', 1606), ('for', 1603), ('in', 1597), ('it', 1287), ('!', 1199), ('-', 1146)]
Vocabulary size: 26513


## Stop Words

In [10]:
import nltk
nltk.download('stopwords') # download the stopwords dataset
from nltk.corpus import stopwords

# take 150 most common words from the Tweets corpus and filter out stop words and punctuation
filtered_tokens = []
punctuation_chars = '. , : ( ) ! ? " = & - ; ... \\ '.split() # list of punctuation symbols to ignore
for word,count in token_counter.most_common(150):
    if word.lower() in stopwords.words("english") or word in punctuation_chars:
        continue
    filtered_tokens.append((word,count))
print("Number of tokens:",len(filtered_tokens))
print("Tokens:",filtered_tokens)

Number of tokens: 67
Tokens: [('@', 7482), ('rt', 5795), ('#', 2705), ('…', 1740), ("'s", 803), ('’s', 557), ('christmas', 528), ("n't", 483), ('like', 452), ('n’t', 408), ('one', 387), ('amp', 382), ('love', 374), ('new', 348), ('people', 337), ('get', 309), ('year', 307), ('day', 300), ('“', 280), ("'", 277), ('2017', 254), ('today', 254), ('time', 252), ('good', 250), ('1', 247), ('”', 233), ('got', 218), ('see', 215), ('know', 202), ('back', 198), ('want', 198), ('best', 194), ('need', 192), ("'m", 191), ('2', 188), ('happy', 187), ('family', 187), ('2018', 185), ('u', 183), ('make', 179), ('’m', 173), ('go', 170), ('life', 169), ('thank', 168), ('!!', 164), ('much', 163), ('bts', 162), ('would', 161), ('/', 160), ('merry', 159), ('first', 157), ('follow', 156), ('us', 154), ('great', 152), ('trump', 151), ('really', 151), ('ca', 145), ('right', 144), ("'re", 144), ('even', 141), ('everyone', 141), ('think', 140), ('could', 139), ('let', 139), ('..', 136), ('someone', 134), ('ever'

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# 5) Calculate *tf-idf*

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = truncated_data

vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

print(vectorizer.get_feature_names())
print(vectorizer.get_stop_words())
print(X.shape)
print(X)

None
(10000, 24153)
  (0, 9065)	0.6216331943318565
  (0, 10508)	0.20885975310695637
  (0, 4541)	0.4729753444297734
  (0, 14600)	0.24142943088219096
  (0, 15709)	0.3149429821551814
  (0, 4252)	0.4344730678575065
  (1, 23510)	0.2717316138348706
  (1, 21248)	0.11419345504068777
  (1, 15735)	0.24908785348054455
  (1, 1345)	0.1877869219967731
  (1, 15973)	0.45083724046434337
  (1, 4827)	0.38655100173009477
  (1, 15703)	0.24279643885618404
  (1, 21670)	0.12172915342983122
  (1, 23836)	0.1366353675851095
  (1, 21218)	0.2528622173415897
  (1, 2863)	0.2915729214046298
  (1, 7543)	0.4708989414614498
  (2, 3112)	1.0
  (3, 5734)	0.3811404136308844
  (3, 2316)	0.5150837670128887
  (3, 12733)	0.7566135153623667
  (3, 18289)	0.1302178422033334
  (4, 21486)	0.39211201327112183
  (4, 4341)	0.53376393362908
  :	:
  (9998, 3883)	0.33998699172087554
  (9998, 23343)	0.23566193159334076
  (9998, 15061)	0.23732170681017162
  (9998, 7380)	0.28342506937454826
  (9998, 21231)	0.18404580861894826
  (9998, 12702)

# 6) Similar Tweets

In [12]:
import os
import re
import binascii
from time import time

In [13]:
def get_shingles(truncated_data,k=10):
    """Get all shingles from requested file (hashes of these shingles)
    """
    for line in truncated_data:
        tweets = ' '.join([line[:-1].strip() for line in truncated_data])
        L = len(tweets)
        shingles = set()  # we use a set to automatically eliminate duplicates
        for i in range(L-k+1):
            shingle = tweets[i:i+k]
            crc = binascii.crc32(shingle.encode("utf-8")) #& 0xffffffff  # hash the shingle to a 32-bit integer
            shingles.add(crc)
        return list(shingles)

In [14]:
shingles = get_shingles(truncated_data,k=5)
shingles

[222298115,
 720896004,
 1497366533,
 3554148356,
 908066827,
 2528116752,
 1968701460,
 1387790357,
 555745301,
 623378455,
 704118805,
 3462397983,
 3880255531,
 4187488301,
 1680343096,
 199229500,
 890765374,
 4175429695,
 3759669312,
 1633681473,
 2854748227,
 1946157123,
 3251109960,
 4290248777,
 1528823883,
 646971469,
 4002938960,
 389546064,
 1949827153,
 1033896020,
 1649410136,
 2722627676,
 3890741342,
 2914517086,
 3940548705,
 2563244132,
 1197998181,
 1032323177,
 3038249069,
 3778543728,
 2579497074,
 2696413303,
 4264034423,
 1075314810,
 245366906,
 2651324542,
 306708612,
 1409810564,
 3628073099,
 318767245,
 3872915598,
 3454009489,
 1308098707,
 1045430421,
 3733979293,
 3268935841,
 2260205730,
 3638558883,
 2198864036,
 2071462051,
 957874340,
 443023528,
 1573034,
 4183818414,
 3292528815,
 2454716597,
 520618171,
 969932988,
 1551368388,
 710410438,
 316145862,
 2457338054,
 935854283,
 541065423,
 1286078672,
 832569555,
 4195352788,
 4289200341,
 63963351,


## using minshahing to find similar tweets

In [15]:
# set global parameters to process the whole dataset
bands = 5
rows = 5
nsig = bands*rows  # number of elements in signature, or the number of different random hash functions
maxShingleID = 2**32-1  # record the maximum shingle ID that we assigned
nextPrime = 4294967311  # next prime number after maxShingleID
A = np.random.randint(0,nextPrime,size=(nsig,))
B = np.random.randint(0,nextPrime,size=(nsig,))

def minhash_vectorized(shingles,A,B,nextPrime,maxShingleID,nsig):
    signature = np.ones((nsig,))*(maxShingleID+1)
    for ShingleID in shingles:
        hashCodes = ((A*ShingleID+B) % nextPrime) % maxShingleID
        np.minimum(signature, hashCodes, out=signature)
    return signature

In [None]:

t = time()
signatures_all_files_1 = []
for sh in get_shingles(truncated_data, k=5):
    signature = minhash_vectorized(get_shingles(truncated_data, k=5),A,B, nextPrime, maxShingleID, nsig)
    signatures_all_files_1.append(signature)
t1 = time()-t
print("slow code took {} seconds".format(t1))
t = time()
signatures_all_files_2 = []
for sh in get_shingles(truncated_data, k=5):
    signature = minhash_vectorized(sh, A, B, nextPrime, maxShingleID, nsig)
    signatures_all_files_2.append(signature)
t2 = time()-t
print("slow code took {} seconds".format(t2))
print('speedup {}'.format(t1/t2))
signatures_all_files_1 = np.array(signatures_all_files_1)
signatures_all_files_2 = np.array(signatures_all_files_2)
print("results are the same: {}".format(np.allclose(signatures_all_files_1, signatures_all_files_2)))