<a href="https://colab.research.google.com/github/naveenk5199/nlp_practice/blob/main/Identify_the_sentiments.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Ingestion


In [None]:
import pandas as pd
# Project folder (presumably on a Google Drive mounted in Colab); adjust when running elsewhere.
drive_path = '/content/drive/MyDrive/Colab Notebooks/NLP/Identify-the-sentiments/'
# Read the training CSV from the project's data/ sub-folder into a DataFrame.
df = pd.read_csv(drive_path  + 'data/train_2kmZucJ.csv')
# Display (rows, columns) as the cell output.
df.shape

(7920, 3)

In [None]:
df.head(3)

Unnamed: 0,id,label,tweet
0,1,0,#fingerprint #Pregnancy Test https://goo.gl/h1...
1,2,0,Finally a transparant silicon case ^^ Thanks t...
2,3,0,We love this! Would you go? #talk #makememorie...


# Preprocessing


Tokenisation, Stopwords removal & Lemmatisation

In [None]:
import nltk

In [None]:
# nltk.word_tokenize needs the Punkt tokenizer models. Newer NLTK releases load
# them from the 'punkt_tab' package instead of the pickled 'punkt' package, so
# fetch both to keep the notebook runnable on a fresh kernel across versions.
nltk.download('punkt')
nltk.download('punkt_tab')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
# Fetch the stop-word lists read later through nltk.corpus.stopwords.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
from nltk.corpus import stopwords

In [None]:
# Fetch the WordNet corpus (presumably what WordNetLemmatizer looks words up in).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [None]:
from nltk.stem import WordNetLemmatizer

In [None]:
# Single WordNet lemmatizer instance, shared by the normalisation code in this notebook.
wordnet_lemmatizer = WordNetLemmatizer()

In [None]:
import re, numpy as np

In [None]:
# English stop words held in a set so the per-token membership test is cheap.
stopwords_set = set(stopwords.words('english'))

def normalize_document(doc):
  """Clean a single document and return it as a space-joined string of lemmas.

  Processing steps:
    1. Remove every character that is not an ASCII letter or whitespace.
    2. Lower-case the text and strip leading/trailing whitespace.
    3. Tokenise with nltk.word_tokenize.
    4. Drop tokens found in ``stopwords_set``.
    5. Lemmatise each remaining token as noun, then verb, then adjective,
       feeding the result of each pass into the next one.

  Args:
    doc: The raw text of one document (a ``str``).

  Returns:
    The normalised document: surviving lemmas joined by single spaces.
  """
  # Flags must be passed by keyword: the fourth positional parameter of re.sub
  # is `count`, so passing re.I|re.A positionally would silently cap the
  # number of substitutions instead of setting the flags.
  doc = re.sub(r'[^a-zA-Z\s]', '', doc, flags=re.I|re.A)
  doc = doc.lower().strip()

  filtered_tokens = []
  for token in nltk.word_tokenize(doc):
    if token in stopwords_set:
      continue

    # Chain the lemmatisation passes so each one refines the previous result
    # rather than restarting from the raw token.
    for pos in ("n", "v", "a"):
      token = wordnet_lemmatizer.lemmatize(token, pos = pos)

    filtered_tokens.append(token)

  return ' '.join(filtered_tokens)
  

In [None]:
# Wrap normalize_document with np.vectorize so it can be applied element-wise to a
# whole sequence of documents in one call.
normalize_corpus = np.vectorize(normalize_document)

In [None]:
# Pull out the raw tweet text column, print it for a quick look, and check
# what type .tolist() produces.
tweet_corpus = df['tweet']
print(tweet_corpus)
type(tweet_corpus.tolist())

0       #fingerprint #Pregnancy Test https://goo.gl/h1...
1       Finally a transparant silicon case ^^ Thanks t...
2       We love this! Would you go? #talk #makememorie...
3       I'm wired I know I'm George I was made that wa...
4       What amazing service! Apple won't even talk to...
                              ...                        
7915    Live out loud #lol #liveoutloud #selfie #smile...
7916    We would like to wish you an amazing day! Make...
7917    Helping my lovely 90 year old neighbor with he...
7918    Finally got my #smart #pocket #wifi stay conne...
7919    Apple Barcelona!!! #Apple #Store #BCN #Barcelo...
Name: tweet, Length: 7920, dtype: object


list

In [None]:
# Apply the vectorised normaliser to every tweet in the corpus.
tweet_norm_corpus = normalize_corpus(tweet_corpus)

In [None]:
tweet_norm_corpus[:3]

array(['fingerprint pregnancy test httpsgooglhmfqv android apps beautiful cute health igers iphoneonly iphonesia iphone',
       'finally transparant silicon case thanks uncle yay sony xperia sonyexperias httpinstagramcompygetjcjm',
       'love would go talk makememories unplug relax iphone smartphone wifi connect httpfbmenlsupcu'],
      dtype='<U329')

In [None]:
tweet_norm_corpus[1]

'finally transparant silicon case thanks uncle yay sony xperia sonyexperias httpinstagramcompygetjcjm'

In [None]:
type(tweet_norm_corpus)

numpy.ndarray

# Feature Engineering

Vectorisation

TF-IDF Vectoriser

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF vectoriser with IDF weighting enabled. Float min_df/max_df values are
# document-frequency proportions (per scikit-learn), so 0.0 and 1.0 prune nothing.
tfidf_vectorizer = TfidfVectorizer(use_idf=True, min_df=0.0, max_df=1.0)

In [None]:
# Learn the vocabulary from the normalised tweets and compute their TF-IDF
# weights, then convert the result to a dense array (one row per tweet).
tweet_tfidf_vectorizer_matrix = tfidf_vectorizer.fit_transform(tweet_norm_corpus)
tweet_tfidf_vectorizer_matrix = tweet_tfidf_vectorizer_matrix.toarray()

# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# and fall back only on old releases that do not provide it yet.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
  tweet_vocab = list(tfidf_vectorizer.get_feature_names_out())
else:
  tweet_vocab = tfidf_vectorizer.get_feature_names()

# Show the weights rounded to two decimals, one column per vocabulary term.
pd.DataFrame(np.round(tweet_tfidf_vectorizer_matrix, 2), columns = tweet_vocab)



Unnamed: 0,aa,aaaahhhhhhh,aah,aalborg,aand,aapl,aarhus,aaron,aaronbrandt,aaronskip,...,zpictwittercomgskxbmkrto,zpictwittercomhnzltjvn,zpictwittercomnlmqvrxzv,zpictwittercomsschughesb,zsofimonster,zumies,zune,zunehd,zurich,zx
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7915,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7916,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
from sklearn.decomposition import LatentDirichletAllocation

# Fit a three-component LDA model on the dense TF-IDF matrix (fixed seed for
# repeatable results) and label the three output columns T1..T3.
lda = LatentDirichletAllocation(
    n_components=3,
    max_iter=10000,
    random_state=0,
)
dt_matrix = lda.fit_transform(tweet_tfidf_vectorizer_matrix)
features = pd.DataFrame(data=dt_matrix, columns=['T1', 'T2', 'T3'])
features