In [None]:
import numpy as np
import pandas as pd
import nltk
# NOTE(review): this binding is replaced by the vaderSentiment import of the
# same name a few lines below, so the NLTK class is never the one in use.
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import re
from textblob import TextBlob
# Rebinds SentimentIntensityAnalyzer; from here on the name refers to the
# vaderSentiment implementation.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from wordcloud import WordCloud
import seaborn as sns
import matplotlib.pyplot as plt
import cufflinks as cf 
%matplotlib inline
# Plotly/cufflinks notebook setup so figures render inside the notebook.
from plotly.offline import init_notebook_mode, iplot
init_notebook_mode(connected=True)
cf.go_offline();
import plotly.graph_objs as go
from plotly.subplots import make_subplots

import warnings
# Silence every warning for the rest of the session; the warn() call below is
# a sanity check that the filter is active (its message should not appear).
warnings.filterwarnings("ignore")
warnings.warn("this will not show")

# Never elide columns when a DataFrame is displayed.
pd.set_option('display.max_columns', None)


In [None]:
# Load the review data; the path is relative to the notebook's working directory.
df = pd.read_csv("amazon.csv")

In [None]:
# Preview the first rows of the freshly loaded frame.
df.head()

In [None]:
# Show a bounded slice from the end of the frame instead of dumping the whole
# DataFrame into the notebook output.
df.tail()

In [None]:
# Rank reviews by their Wilson lower bound, best first.
df = df.sort_values("wilson_lower_bound", ascending=False)

# The leftover CSV index column is called 'Unnamed: 0' (capital U, no space
# before the colon) -- the same spelling the helper functions in this notebook
# check for. Reassign instead of using inplace=True so the cell is safe to re-run.
if 'Unnamed: 0' in df.columns:
    df = df.drop(columns='Unnamed: 0')
else:
    # Nothing to drop (e.g. the cell was already executed once).
    print("Column 'Unnamed: 0' not found in the DataFrame")

# Display the first few rows of the modified DataFrame
df.head()


In [None]:
def missing_values_analysis(df):
    """Summarise the columns of ``df`` that contain at least one null.

    Args:
        df: DataFrame to inspect.

    Returns:
        DataFrame indexed by column name with two columns:
        ``'Missing Values'`` (null count) and ``'Ratio'`` (null percentage of
        all rows, rounded to 2 decimals), ordered by ascending count.
    """
    incomplete = [name for name in df.columns if df[name].isnull().sum() > 0]
    null_counts = df[incomplete].isnull().sum()
    total_rows = df.shape[0]

    counts_sorted = null_counts.sort_values(ascending=True)
    percent_sorted = (null_counts / total_rows * 100).sort_values(ascending=True)

    summary = pd.concat(
        [counts_sorted, np.round(percent_sorted, 2)],
        axis=1,
        keys=['Missing Values', 'Ratio'],
    )
    return pd.DataFrame(summary)

def check_dataframe(df, head=5, tail=5):
    """Print a quick structural report for ``df``.

    Sections, in order: shape, column dtypes, missing-value table (from
    ``missing_values_analysis``), number of duplicated rows, and selected
    quantiles of the numeric columns (the ``'Unnamed: 0'`` column is skipped).

    Args:
        df: DataFrame to describe.
        head: Not used by the report; kept so existing calls keep working.
        tail: Not used by the report; kept so existing calls keep working.

    Returns:
        None. Everything is written to stdout.
    """
    print("SHAPE".center(82, '~'))
    print('Rows: {}'.format(df.shape[0]))
    print('Columns: {}'.format(df.shape[1]))
    print("TYPES".center(82, '~'))
    print(df.dtypes)
    print("".center(82, '~'))
    print(missing_values_analysis(df))
    print('DUPLICATED VALUES'.center(83, '~'))
    print(df.duplicated().sum())
    print("QUANTILES".center(82, '~'))

    # Keep numeric columns only and leave out the leftover CSV index column.
    numeric_columns = [
        col for col in df.select_dtypes(include=['number']).columns
        if col != 'Unnamed: 0'
    ]
    # Min, 5th percentile, median, 95th, 99th, max. The lower tail is 0.05;
    # listing 0.5 twice would only print the median row a second time.
    print(df[numeric_columns].quantile([0, 0.05, 0.50, 0.95, 0.99, 1]).T)

# Print the structural report for the review data.
check_dataframe(df)



In [None]:
def check_class(dataframe):
    """Count the distinct values in every column of ``dataframe``.

    The ``'Unnamed: 0'`` column, when present, is left out of the report; the
    caller's frame is not modified.

    Args:
        dataframe: DataFrame to inspect.

    Returns:
        DataFrame with columns ``'Variable'`` (column name) and ``'Classes'``
        (number of unique values), sorted by ``'Classes'`` descending with a
        fresh 0..n-1 index.
    """
    frame = dataframe
    if 'Unnamed: 0' in frame.columns:
        frame = frame.drop('Unnamed: 0', axis=1)

    unique_counts = []
    for name in frame.columns:
        unique_counts.append(len(pd.unique(frame[name])))

    summary = pd.DataFrame({'Variable': frame.columns, 'Classes': unique_counts})
    ordered = summary.sort_values('Classes', ascending=False)
    return ordered.reset_index(drop=True)

# Unique-value count per column of the review data, highest first.
check_class(df)



In [None]:
# Colour palette shared by the bar and pie traces below.
constraints = ['#B34D22', '#EBE00C', '#1FEB0C', '#0C92EB', '#EB0CD5']


def categorical_variable_summary(df, column_name):
    """Render a bar chart and a pie chart of the value counts of one column.

    Args:
        df: DataFrame holding the column.
        column_name: Name of the column to summarise; also used as the
            figure title and trace name.

    Returns:
        None. The figure is displayed through ``iplot``.
    """
    counts = df[column_name].value_counts()

    bar_trace = go.Bar(
        x=[str(label) for label in counts.index],
        y=counts.values.tolist(),
        text=counts.values.tolist(),
        textfont=dict(size=14),
        name=column_name,
        textposition='auto',
        showlegend=False,
        marker=dict(color=constraints, line=dict(color='#DBE6EC', width=1)),
    )
    pie_trace = go.Pie(
        labels=counts.keys(),
        values=counts.values,
        textfont=dict(size=18),
        textposition='auto',
        showlegend=False,
        name=column_name,
        marker=dict(colors=constraints),
    )

    # Left panel is a cartesian plot (bars), right panel a domain plot (pie).
    fig = make_subplots(
        rows=1,
        cols=2,
        subplot_titles=('Countplot', 'Percentage'),
        specs=[[{"type": "xy"}, {'type': 'domain'}]],
    )
    fig.add_trace(bar_trace, row=1, col=1)
    fig.add_trace(pie_trace, row=1, col=2)

    title_settings = {
        'text': column_name,
        'y': 0.9,
        'x': 0.5,
        'xanchor': 'center',
        'yanchor': 'top',
    }
    fig.update_layout(title=title_settings, template='plotly_white')

    iplot(fig)


In [None]:
# Bar and pie chart of the value counts of the 'overall' column.
categorical_variable_summary(df, 'overall')

In [None]:
# First few raw review texts.
df["reviewText"].head()


In [None]:
# Take the review stored under index label 2031 as a worked example.
review_example = df["reviewText"][2031]
review_example

In [None]:
# Replace every non-letter character with a space. Substituting the empty
# string would also delete the whitespace between words and fuse the whole
# review into one unreadable token.
review_example = re.sub("[^a-zA-Z]", ' ', review_example)
review_example

In [None]:
# Re-read the example review from the frame so this cell does not depend on
# what earlier cells did to `review_example`.
review_example = df["reviewText"][2031]

# Lower-case the text, then split it on whitespace into a list of words.
processed_text = review_example.lower().split()

# Show the resulting tokens.
processed_text


In [None]:
def rt(x):
    """Return ``x`` as text with every non-letter character replaced by a space.

    Missing values (NaN/None) become an empty string; passing them through
    ``str()`` would turn them into the literal word "nan", which would then be
    scored as if it were a real review.
    """
    text = "" if pd.isna(x) else str(x)
    return re.sub("[^a-zA-Z]", ' ', text)


# Strip non-letters, then lower-case the whole column.
df["reviewText"] = df["reviewText"].map(rt).str.lower()
df.head()

In [None]:
# Re-import kept so this cell binds the vaderSentiment analyzer regardless of
# which same-named class was imported last at the top of the notebook.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# TextBlob polarity/subjectivity for every review. Scores are collected in one
# pass and assigned positionally, avoiding a pd.Series allocation per row.
blob_scores = [TextBlob(text).sentiment for text in df['reviewText']]
df['polarity'] = [score.polarity for score in blob_scores]
df['subjectivity'] = [score.subjectivity for score in blob_scores]

# Build the VADER analyzer once; constructing it inside the loop repeats its
# set-up work for every single review.
analyzer = SentimentIntensityAnalyzer()


def _vader_label(text):
    """Label ``text`` by comparing VADER's 'neg' and 'pos' scores.

    Returns "Negative" when neg > pos, "Positive" when pos > neg, and
    "Neutral" when the two scores are equal.
    """
    scores = analyzer.polarity_scores(text)
    if scores['neg'] > scores['pos']:
        return "Negative"
    if scores['pos'] > scores['neg']:
        return "Positive"
    return "Neutral"


# One column assignment instead of a row-by-row df.loc write.
df['sentiment'] = df['reviewText'].map(_vader_label)




In [None]:
# Sentiment labels are written capitalised ("Positive" / "Negative" /
# "Neutral"), so the filter must match that exact spelling; a lower-case
# 'positive' selects no rows at all.
df[df['sentiment'] == 'Positive'].sort_values("wilson_lower_bound",
                                           ascending=False).head(5)

In [None]:
# Bar and pie chart of the value counts of the 'sentiment' column.
categorical_variable_summary(df, 'sentiment')