In [0]:
%scala
// Training split: read the header-bearing, comma-delimited CSV from ADLS and
// publish it as the temp view "train_twitter" so other cells can query it.
val filepath1 = "abfss://<Your ADLS Blob>.dfs.core.windows.net/mldata/Twitter_NLP/train_tweets.csv"
var df1 = spark.read
  .format("csv")
  .option("header", "true")
  .option("delimiter", ",")
  .load(filepath1)
df1.createOrReplaceTempView("train_twitter")

// Test split: identical reader settings, published as "test_twitter".
val filepath2 = "abfss://<Your ADLS Blob>.dfs.core.windows.net/mldata/Twitter_NLP/test_tweets.csv"
var df2 = spark.read
  .format("csv")
  .option("header", "true")
  .option("delimiter", ",")
  .load(filepath2)
df2.createOrReplaceTempView("test_twitter")



In [0]:
# Look up both temp views (registered by the Scala cell) as Spark DataFrames.
train_twitter, test_twitter = (
    spark.sql("""select * from train_twitter"""),
    spark.sql("""select * from test_twitter"""),
)

# Training split: collect to a pandas DataFrame, then persist as CSV under DBFS.
train_twitter = train_twitter.toPandas()
outdir = '/dbfs/FileStore/train_twitter.csv'
train_twitter.to_csv(path_or_buf=outdir, index=False)

# Test split: same treatment; `outdir` is re-pointed at the test file.
test_twitter = test_twitter.toPandas()
outdir = '/dbfs/FileStore/test_twitter.csv'
test_twitter.to_csv(path_or_buf=outdir, index=False)

In [0]:
%pip install nltk
%pip install textblob

In [0]:
# Suppress every Python warning raised after this cell runs.
# NOTE(review): this also hides pandas/NLTK deprecation and chained-assignment
# warnings from the cells below - confirm that is intended.
import warnings
warnings.filterwarnings(action='ignore')

In [0]:
import numpy as np
import pandas as pd

# Name of the column that holds the tweet text used throughout the notebook.
text_col = 'tweet'

# Reload the CSV snapshots that the export cell wrote under /dbfs/FileStore.
train, test = (
    pd.read_csv(csv_path, header='infer')
    for csv_path in (
        "/dbfs/FileStore/train_twitter.csv",
        "/dbfs/FileStore/test_twitter.csv",
    )
)

# `df` is an alias of `train`, not a copy: changes made through either name
# are visible through the other.
df = train
############################################


print('\n\n****************TEXT DATA****************\n\n')
print(df[text_col].head())

print('\n\n1. BASIC FEATURE EXTRACTION\n\n')

# Derive every feature from one string-normalised copy of the text column.
# Previously only word_count guarded with str(x); the other features called
# .split() / iterated directly and raised on a missing (NaN) tweet. Missing
# values are treated as empty strings here; df[text_col] itself is untouched.
tweet_text = df[text_col].fillna('').astype(str)

## Number of words (intuition: negative sentiments generally contain fewer words than positive ones)
# split(" ") is kept as in the original: consecutive spaces produce empty tokens that count as words.
print('\n\n****************WORD COUNT****************\n\n')
df['word_count'] = tweet_text.apply(lambda tweet: len(tweet.split(" ")))
print(df[[text_col,'word_count']].head())

## Number of characters (length of the text, spaces included)
print('\n\n****************NUMBER OF CHARACTERS****************\n\n')
df['char_count'] = tweet_text.str.len()
print(df[[text_col,'char_count']].head())

## Average word length (sum of the lengths of all space-separated tokens divided by the token count)
print('\n\n****************AVERAGE WORD LENGTH****************\n\n')
df['avg_word'] = tweet_text.apply(
    lambda tweet: np.mean([len(word) for word in tweet.split(" ")])
)
print(df[[text_col,'avg_word']].head())

## Count of special characters / numbers
# special_char counts every character for which str.isalpha() is False.
# NOTE(review): that includes whitespace and digits, not only punctuation - confirm this is intended.
# numerics counts whitespace-separated tokens made up entirely of digits.
print('\n\n****************SPECIAL CHARACHTERS COUNT****************\n\n')
df["special_char"] = tweet_text.apply(
    lambda tweet: sum(not char.isalpha() for char in tweet)
)
df['numerics'] = tweet_text.apply(
    lambda tweet: sum(token.isdigit() for token in tweet.split())
)
print(df[[text_col,'special_char','numerics']].head())

## Number of upper-case words (anger / extreme emotions in text are often upper case)
print('\n\n****************UPPER CASE WORDS COUNT****************\n\n')
df['upper'] = tweet_text.apply(
    lambda tweet: sum(token.isupper() for token in tweet.split())
)
print(df[[text_col,'upper']].head())





In [0]:
#import pandas as pd
#import numpy as np
#train = pd.read_csv("/dbfs/FileStore/train_twitter.csv", header='infer')
#test = pd.read_csv("/dbfs/FileStore/test_twitter.csv", header='infer')
#text_col='tweet'
#df=train
############################################


print('\n\n****************TEXT DATA****************\n\n')
print(df[text_col].head())

print('\n\n 2. BASIC PRE-PROCESSING\n\n')

## Structural standardisation: lower-case every word so that 'INDIA' and 'India' are treated as the same token.
# " ".join(tokens) glues the whitespace-split tokens back together with single
# spaces, so runs of whitespace are collapsed as a side effect.
print('\n\n****************CONVERT TO LOWER CASE****************\n\n')


def _lowercase_tokens(tweet):
    """Lower-case each whitespace-separated token and re-join with single spaces."""
    return " ".join(map(str.lower, tweet.split()))


df[text_col] = df[text_col].apply(_lowercase_tokens)
print(df[text_col].head())

## Remove punctuation (it adds no extra information when treating text data)
# Pattern: drop every character that is neither a word character (\w) nor whitespace (\s).
# regex=True is required: from pandas 2.0 Series.str.replace treats the pattern
# as a literal string by default, which would turn this step into a no-op.
# The raw string keeps \w and \s from being parsed as (invalid) string escapes.
print('\n\n****************REMOVE PUNCTUATIONS****************\n\n')
df[text_col] = df[text_col].str.replace(r'[^\w\s]', '', regex=True)
print(df[text_col].head())

## Stop words (commonly occurring English words) removal - they add no extra information to text data
# NOTE(review): the punctuation step earlier in this cell strips apostrophes, so
# contractions such as "dont" presumably no longer match NLTK entries like
# "don't" - confirm whether that matters for this analysis.
print('\n\n****************STOP WORDS REMOVAL****************\n\n')
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')
# 'punkt' is fetched here for the nltk.word_tokenize call in the N-Grams step of this cell.
nltk.download('punkt')
stop = stopwords.words('english')
# Hash-based lookup: membership is tested once per token of every tweet, so
# scanning the stop-word list each time is needless work.
stop_lookup = frozenset(stop)
df[text_col] = df[text_col].apply(
    lambda tweet: " ".join(word for word in tweet.split() if word not in stop_lookup)
)
print(df[text_col].head())

## Removal of the words that appear most often across the whole corpus
## (their presence is of no use when classifying the text data)
# Count every whitespace-separated token of the corpus and keep the 10 most frequent.
print('\n\n****************MOST FREQUENT WORDS REMOVAL****************\n\n')
corpus_tokens = ' '.join(df[text_col]).split()
token_counts = pd.Series(corpus_tokens).value_counts()
freq = list(token_counts[:10].index)
print("Most Frequent Words: ",freq)


def _drop_frequent_tokens(tweet):
    """Re-join the tokens of `tweet` that are not in the `freq` list."""
    return " ".join(token for token in tweet.split() if token not in freq)


train[text_col] = train[text_col].apply(_drop_frequent_tokens)
print(train[text_col].head())

## Rare words removal (they are so rare that their association with other words is dominated by noise)
# value_counts() sorts by descending count, so the last 10 entries are the 10 least frequent tokens.
print('\n\n****************MOST RARE WORDS REMOVAL****************\n\n')
corpus_tokens = ' '.join(train[text_col]).split()
token_counts = pd.Series(corpus_tokens).value_counts()
freq = list(token_counts[-10:].index)
print("Least Frequent Words: ",freq)


def _drop_rare_tokens(tweet):
    """Re-join the tokens of `tweet` that are not in the `freq` list."""
    return " ".join(token for token in tweet.split() if token not in freq)


train[text_col] = train[text_col].apply(_drop_rare_tokens)
print(train[text_col].head())

## Spelling correction (reduces multiple spellings of the same word being treated as different words)
# Correction is slow, so it is limited to the first five rows for testing.
# Abbreviations should be expanded before this step: e.g. 'ur' (meaning 'your')
# may otherwise be "corrected" to an unrelated word such as 'or'.
print('\n\n****************SPELLING CORRECTION****************\n\n')
from textblob import TextBlob
# Write back through a single .loc call. The previous chained form
# train[text_col][:5] = ... assigns into an intermediate Series, which is not
# guaranteed to update `train` (and never does under pandas Copy-on-Write).
first_rows = train.index[:5]
train.loc[first_rows, text_col] = train.loc[first_rows, text_col].apply(
    lambda tweet: " ".join(str(TextBlob(word).correct()) for word in tweet.split())
)
print(train[text_col].head())

## Stemming (strips suffixes such as "ing", "ly", "s" to reduce different forms of a word to a common base)
# Limited to the first five rows for testing.
print('\n\n****************STEMMING****************\n\n')
from nltk.stem import PorterStemmer
st = PorterStemmer()
# Single .loc assignment instead of the chained train[text_col][:5] = ...,
# which writes into an intermediate Series and may leave `train` unchanged
# (always, under pandas Copy-on-Write).
first_rows = train.index[:5]
train.loc[first_rows, text_col] = train.loc[first_rows, text_col].apply(
    lambda tweet: " ".join(st.stem(word) for word in tweet.split())
)
print(train[text_col].head())

## Lemmatisation (maps each word to its root word instead of merely stripping suffixes, as stemming does)
print('\n\n****************LEMMATISATION****************\n\n')
from textblob import Word
nltk.download('wordnet')


def _lemmatize_tokens(tweet):
    """Lemmatise each whitespace-separated token and re-join with single spaces."""
    return " ".join(Word(token).lemmatize() for token in tweet.split())


train[text_col] = train[text_col].apply(_lemmatize_tokens)
print(train[text_col].head())

## N-Grams identification
print('\n\n****************N-Grams****************\n\n')
# Explicit names instead of `from nltk.collocations import *`, so it is clear
# where each name comes from and the notebook namespace is not polluted.
from nltk.collocations import BigramAssocMeasures, BigramCollocationFinder

# Tokenise tweet by tweet. Joining the whole corpus into one string first made
# the last word of one tweet and the first word of the next count as a bigram.
tokenized_tweets = [nltk.word_tokenize(tweet) for tweet in df[text_col].values]

# Not used in this cell; kept so that association-measure scoring remains
# available to any later cell that relies on this name.
bigram_measures = BigramAssocMeasures()

# from_documents() pads between documents, so no bigram spans two tweets.
finder = BigramCollocationFinder.from_documents(tokenized_tweets)

# All bigrams ordered by descending raw frequency; the first 10 are reported.
Top_bigrams = finder.ngram_fd.most_common()
print("Top 10 N-Grams :",Top_bigrams[:10])

## Sentiment analysis
# Limited to the first five rows for testing because it is slow.
# TextBlob(...).sentiment is a (polarity, subjectivity) pair; only the polarity
# (index 0) is kept: values near 1 mean positive sentiment, values near -1 mean
# negative sentiment. It can also serve as a feature for a machine learning model.
print('\n\n****************SENTIMENT ANALYSIS****************\n\n')
# Explicit object dtype so the column can hold the "" placeholder alongside
# float polarity scores whatever string dtype pandas would infer for "".
df['sentiment'] = pd.Series("", index=df.index, dtype=object)
# Single .loc assignment instead of the chained df['sentiment'][:5] = ...,
# which writes into an intermediate Series and may leave `df` unchanged
# (always, under pandas Copy-on-Write).
first_rows = df.index[:5]
df.loc[first_rows, 'sentiment'] = df.loc[first_rows, text_col].apply(
    lambda tweet: TextBlob(tweet).sentiment[0]
)
print(df[[text_col,'sentiment']].head())


############################################
print('\n\n****************COMBINED COLUMNS****************\n\n')
## Merge the text and every engineered feature into one space-separated string
## per row, ready to be fed into the vectorizer.
cols = [text_col,'word_count','char_count','avg_word','special_char','numerics','upper','sentiment']


def _row_to_text(row):
    """Cast every cell of `row` to str and join the pieces with single spaces."""
    return ' '.join(row.values.astype(str))


df['combined'] = df[cols].apply(_row_to_text, axis=1)
print(df['combined'].head())

## TF-IDF scoring
# The resulting vectors can be used to compute cosine similarity between text rows.
print('\n\n****************TF-IDF VECTORIZE****************\n\n')
# The nltk.download('punkt') call that used to sit here was dropped: the
# resource is already fetched earlier in this cell and this step does not
# call NLTK at all.
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(
    max_features=1000,
    lowercase=True,
    analyzer='word',
    stop_words='english',
    ngram_range=(1, 1),
)
train_vect = tfidf.fit_transform(df['combined'])
# Dense copy of the sparse matrix, one row per input row, sharing df's index labels.
tfidf_df = pd.DataFrame(train_vect.toarray(), index=df.index.to_list())
print("Shape of Text Converted to vector after adding tf-idf scores :",tfidf_df.shape)


In [0]:
"""
#Get cosine similarity between two texts
Step1. Convert the text into vector of numbers (Using TF-IDF scores)
       a)TF= Frequency of a word in the given sentence or Term-Frequency
       b)IDF = Inverse Document Frequency, typically log(N / number of documents that contain the word), where N is the total number of documents. This is important because some words like is/am/are/the are present throughout the text and add no value/variability when present in a sentence, so these words are allotted a lower score. We can ignore the IDF score here as we have already removed the stop words and the most frequent words across the corpus.
       c)TF-IDF score =TF score * IDF score
       d)The text_to_vector function returns a Counter mapping { word: frequency }, i.e. the raw TF score of each word, and thus converts a text to a vector.
       
Step2.Calculate cosine similarity of the two vectors
      a)cos_sim(vectA,vectB) = (vectA . vectB) / (|vectA| * |vectB|) = (xa.xb + ya.yb + za.zb) / [sqrt(xa.xa + ya.ya + za.za) * sqrt(xb.xb + yb.yb + zb.zb)]
      where vectA=(xa,ya,za) ; vectB=(xb,yb,zb)
"""
import math
import re
from collections import Counter

# Token pattern used by text_to_vector: each match is a maximal run of word characters (\w).
WORD = re.compile(r"\w+") 
def get_cosine(vec1, vec2):
    """Return the cosine similarity of two sparse term vectors.

    Args:
        vec1: Mapping from term to numeric weight (for example a ``Counter``).
        vec2: Mapping of the same kind for the second text.

    Returns:
        float: The dot product over the terms present in both mappings,
        divided by the product of the two Euclidean norms. ``0.0`` is
        returned when that product is zero (e.g. one mapping is empty).
    """
    # Only terms present in both vectors contribute to the dot product.
    shared_terms = set(vec1.keys()) & set(vec2.keys())
    dot_product = sum(vec1[term] * vec2[term] for term in shared_terms)

    norm1 = math.sqrt(sum(weight ** 2 for weight in vec1.values()))
    norm2 = math.sqrt(sum(weight ** 2 for weight in vec2.values()))
    magnitude = norm1 * norm2

    # Guard against division by zero for empty / all-zero vectors.
    if not magnitude:
        return 0.0
    return float(dot_product) / magnitude


def text_to_vector(text):
    """Convert a text into a bag-of-words vector.

    Tokens are the non-overlapping matches of the module-level ``WORD``
    pattern in ``text``; the returned ``Counter`` maps each token to the
    number of times it occurs.
    """
    return Counter(WORD.findall(text))

#######################################################CALLING#############################################################
# Use the combined strings stored under labels 0, 1 and 2 as sample documents.
text1, text2, text3 = (df['combined'][row_label] for row_label in (0, 1, 2))

print("Text 1 :",text1)
print("Text 2 :",text2)
print("Text 3 :",text3)

# Bag-of-words vector for each sample.
vector1, vector2, vector3 = map(text_to_vector, (text1, text2, text3))

# Compare the first sample against the other two.
cosine12 = get_cosine(vector1, vector2)
print("Cosine Similarity between String 1 & String 2:", cosine12)

cosine13 = get_cosine(vector1, vector3)
print("Cosine Similarity between String 1 & String 3:", cosine13)

#More the cosine similarity the closer the two strings are