In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
from nltk.corpus import stopwords
import nltk
import re
import string

In [3]:
# Read the training and test CSV files of the tweet-sentiment-extraction dataset.
train = pd.read_csv('../input/tweet-sentiment-extraction/train.csv')
test = pd.read_csv('../input/tweet-sentiment-extraction/test.csv')

# Print the shape of each frame, plus the column summary (non-null counts
# and dtypes) of the training frame.
print('Training data shape: ', train.shape)
train.info()
print('Testing data shape: ', test.shape)

Training data shape:  (27486, 4)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27486 entries, 0 to 27485
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   textID         27486 non-null  object
 1   text           27485 non-null  object
 2   selected_text  27485 non-null  object
 3   sentiment      27486 non-null  object
dtypes: object(4)
memory usage: 429.5+ KB
Testing data shape:  (3535, 3)


## Metric

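The Jaccard score is the number of distinct words the two strings share divided by the number of distinct words across both. The higher the score, the more similar the two strings.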

In [None]:
def jaccard(str1, str2):
    '''
    Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    (distinct words shared by both) / (distinct words across both).

    Parameters
    ----------
    str1, str2 : str
        Texts to compare.

    Returns
    -------
    float
        Score between 0.0 (no shared words) and 1.0 (same word set).
        Two strings that contain no words at all (empty or whitespace-only)
        are treated as identical and score 1.0.
    '''
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    # Both word sets empty: the union is empty and the ratio would be 0 / 0.
    if not a and not b:
        return 1.0
    c = a.intersection(b)
    # len(a) + len(b) - len(c) is the size of the union of a and b.
    return len(c) / (len(a) + len(b) - len(c))

# str1 and str2 are the same sentence; str3 shares only part of its words.
str1 = "Let us compare the two strings"
str2 = "Let us compare the two strings"
str3 = "Let us compare some random strings"

# One score per line: identical pair first, then the partially matching pair.
print(jaccard(str1, str2), jaccard(str1, str3), sep='\n')

## Basic EDA

In [6]:
# Missing-value count per column, training frame first (followed by a blank
# line), then the test frame.
print(train.isna().sum(), '\n')
print(test.isna().sum())

textID           0
text             1
selected_text    1
sentiment        0
dtype: int64 

textID       0
text         0
sentiment    0
dtype: int64


In [7]:
# Drop every training row that has a missing value in any column.
# The result is rebound to `train` instead of using inplace=True: the
# filtered frame is the same, and the statement stays chainable.
train = train.dropna(axis=0, how="any")

First five tweets for each sentiment class:

In [8]:
# For each sentiment class, print a banner line followed by the first five
# tweets carrying that label and an empty separator line.
for sentiment in ("positive", "negative", "neutral"):
    banner = sentiment.center(25, '~')
    tweets = train.loc[train['sentiment'] == sentiment, 'text']
    print(banner)
    print(tweets.head())
    print()

~~~~~~~~~positive~~~~~~~~
1        Oh! Good idea about putting them on ice cream
4               haha better drunken tweeting you mean?
6    had an awsome salad! I recommend getting the S...
7     fine! Going to do my big walk today 20 or so ...
8          Thank a yoou  how are you? #TwitterTakeover
Name: text, dtype: object

~~~~~~~~~negative~~~~~~~~
3            i dont think you can vote anymore! i tried
5                          headache  wanna see my Julie
12                                     Miss you my dear
15    Today Dan bought me Bio Dome AND the Reality B...
16                                oo noo thats not good
Name: text, dtype: object

~~~~~~~~~neutral~~~~~~~~~
0     Spent the entire morning in a meeting w/ a ven...
2     says good (or should i say bad?) afternoon!  h...
9     Why don't adobe realise no one WANTS to pay fo...
10                      PRD take a long time to review!
11    _2008 Well, having to revise them!  Was to do ...
Name: text, dtype: object



## Sentiment distribution 

In [9]:
# Share of each sentiment class in the training data, in percent.
train['sentiment'].value_counts(normalize=True).mul(100)

neutral     40.447517
positive    31.224304
negative    28.328179
Name: sentiment, dtype: float64

## Text data preprocessing

1. Make text lowercase
2. Remove hyperlinks
3. Remove punctuation
4. Remove numbers
5. Tokenize
6. Remove stopwords (this step is currently commented out in `text_preprocessing`)

In [10]:
def clean_text(text):
    """
    Normalise a raw tweet for text analysis.

    Steps, in order: lower-case the text; remove text in square brackets;
    remove hyperlinks; remove HTML-style tags; remove punctuation; turn
    newlines into spaces; remove every token that contains a digit.

    Parameters
    ----------
    text : str
        Raw text.

    Returns
    -------
    str
        The cleaned text. Whitespace left behind by removed pieces is not
        collapsed.
    """
    text = text.lower()

    # Text in square brackets. This must run before punctuation removal,
    # which would otherwise strip the brackets and keep their content.
    text = re.sub(r'\[.*?\]', '', text)
    # Hyperlinks (http://, https:// or www. followed by non-space characters)
    text = re.sub(r'https?://\S+|www\.\S+', '', text)

    # HTML-style tags such as <b> or </a>
    text = re.sub(r'<.*?>+', '', text)
    # Punctuation; re.escape makes sure special characters are escaped
    text = re.sub('[%s]' % re.escape(string.punctuation), '', text)

    # Newlines become spaces so the words on either side are not fused together
    text = re.sub(r'\n', ' ', text)
    # Tokens containing a digit (plain numbers and alphanumeric words)
    text = re.sub(r'\w*\d\w*', '', text)
    return text


def text_preprocessing(text):
    """
    Clean a raw text and return its word tokens joined by single spaces.

    The text is passed through ``clean_text``, split into runs of word
    characters, and the resulting tokens are joined with one space each.
    """
    word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')
    tokens = word_tokenizer.tokenize(clean_text(text))
    # Stop words are not filtered out here; all tokens are kept.
    return ' '.join(tokens)

## Analyzing Text Statistics
We can now do some statistical analysis to explore the basic characteristics of the text data. Two analyses that are useful here are:

1. Text length analysis
2. Word frequency analysis

In [11]:
# Number of characters in each tweet.
train['tweet_length'] = train['text'].map(len)
# Number of whitespace-separated words in each tweet.
train['word_frequency'] = train['text'].str.split().map(len)

In [17]:
# Summary statistics (count, mean, std, quartiles, min/max) of the two
# length features.
stat_columns = ['tweet_length', 'word_frequency']
train.loc[:, stat_columns].describe()

Unnamed: 0,tweet_length,word_frequency
count,27485.0,27485.0
mean,68.727779,12.903693
std,35.963657,6.926507
min,3.0,1.0
25%,39.0,7.0
50%,64.0,12.0
75%,97.0,18.0
max,165.0,33.0


## Extracting the sentiment terms

## Output

## References

1. https://www.kaggle.com/parulpandey/basic-preprocessing-and-eda