In [56]:
import json # JSON encoder and decoder: store python data structures (e.g. lists and dictionaries) as text files
import pandas as pd

In [2]:
tweets = []
for line in open(r"C:/Users/shohidul/Desktop/englishtweetssample.json", "rt",encoding="utf-8"):
    tweets.append(json.loads(line))

In [3]:
#tweets

In [4]:
print("Number of documents:", len(tweets))

documents = [document["text"] for document in tweets] # right now we only need the text field for each document
print(len(documents))
print(documents[:7])

Number of documents: 10000
10000
['Check out my class in #GranblueFantasy! https://t.co/pAvXn8diJr', 'Extending a big Thank You to our Community Partner all over the world! https://t.co/cu7on7g1si', 'Blueberry 🍨 https://t.co/2gzHAFWYJY', 'RT @LILUZIVERT: Bad day ☹️®️', "@prologve_ @BTS_ARMY @BTS_twt I'm Chim tho", 'i need a dog to cuddle with right now', 'RT: Country Inn countryinns #CampSprings 🏨 👉🚖 For Taxi 📞703-445-4450 https://t.co/lXdFUm4qUb']


In [5]:
import ufal.udpipe as udpipe

model = udpipe.Model.load(r"C:/Users/shohidul/Desktop/en.segmenter.udpipe")
pipeline = udpipe.Pipeline(model,"tokenize","none","none","horizontal") # horizontal: returns one sentence per line, with words separated by a single space
segmented_document = pipeline.process(documents[0])

print(segmented_document)

Check out my class in # GranblueFantasy !
https://t.co/pAvXn8diJr



In [6]:
from collections import Counter

token_counter = Counter()
for doc in documents[:1000]: # IMDB documents
    tokenized = pipeline.process(doc)
    tokens = tokenized.split() # after segmenter, we can do whitespace splitting
    token_counter.update(tokens)

print("Most common tokens:", token_counter.most_common(20))
print("Vocabulary size:", len(token_counter))

Most common tokens: [('@', 717), (':', 716), ('RT', 612), ('.', 360), ('the', 281), ('#', 280), (',', 247), ('…', 244), ('a', 241), ('to', 228), ('I', 203), ('and', 192), ('you', 178), ('of', 152), ('in', 150), ('for', 137), ('is', 137), ('-', 123), ('!', 108), ('on', 97)]
Vocabulary size: 6078


In [7]:
import nltk
nltk.download('stopwords') # download the stopwords dataset

from nltk.corpus import stopwords

# take 150 most common words from the IMDB corpus and filter out stop words and punctuation
filtered_tokens = []
punctuation_chars = '. , : ( ) ! ? " = & - ; ... \\ '.split() # list of punctuation symbols to ignore
for word, count in token_counter.most_common(50):
    if word.lower() in stopwords.words("english") or word in punctuation_chars:
        continue
    filtered_tokens.append((word, count))
print("Number of tokens:", len(filtered_tokens))
print("Tokens:", filtered_tokens)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shohidul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Number of tokens: 13
Tokens: [('@', 717), ('RT', 612), ('#', 280), ('…', 244), ("'s", 79), ('’s', 53), ('Christmas', 44), ('n’t', 40), ('people', 40), ("n't", 40), ('•', 39), ('one', 38), ('amp', 38)]


In [9]:
documents[0]

'Check out my class in #GranblueFantasy! https://t.co/pAvXn8diJr'

In [41]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import numpy as np
import pandas as pd
import re

In [44]:
Doc = list()
with open(r"C:/Users/shohidul/Desktop/englishtweetssample.json", "rt",encoding="utf-8") as file:
    for line in file:
        for l in re.split(r"\.\s|\?\s|\!\s|\n",line):
            if l:
                Doc.append(l)

In [45]:
CV = CountVectorizer(stop_words='english', min_df=3, max_df=0.5, ngram_range=(1,2))
CvTransform = CV.fit_transform(Doc)

In [47]:
CvTransform

<33244x112458 sparse matrix of type '<class 'numpy.int64'>'
	with 4888238 stored elements in Compressed Sparse Row format>

In [48]:
transformer = TfidfTransformer()
transformed_weights = transformer.fit_transform(CvTransform)
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()
weights_df = pd.DataFrame({'term': cvec.get_feature_names(), 'weight': weights})

In [49]:
weights_df.sort_values(by='weight', ascending=False).head(10)

Unnamed: 0,term,weight
98572,twimg com,0.048676
98571,twimg,0.048676
78785,pbs,0.038264
78786,pbs twimg,0.038257
66634,jpg,0.025646
59931,https pbs,0.023213
84711,resize,0.022151
98022,true,0.021858
98607,twitter,0.021143
98615,twitter com,0.020592
