In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
data = pd.read_csv('C:/Users/HP/sentiment analysis/chatgpt_reviews.csv')

In [3]:
data.shape

(2292, 2)

In [4]:
data.head()

Unnamed: 0,review,rating
0,Up to this point I’ve mostly been using ChatGP...,positif
1,I’ve been a user since it’s initial roll out a...,positif
2,This app would almost be perfect if it wasn’t ...,positif
3,"I recently downloaded the app and overall, it'...",positif
4,I appreciate the devs implementing Siri suppor...,positif


# Data Preprocessing

In [5]:
data.duplicated().sum()

32

In [6]:
# menghapus duplikat data
data = data.drop_duplicates()

In [7]:
data.isnull().sum()

review    0
rating    0
dtype: int64

In [8]:
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2260 entries, 0 to 2291
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   review  2260 non-null   object
 1   rating  2260 non-null   object
dtypes: object(2)
memory usage: 53.0+ KB


In [9]:
data.describe()

Unnamed: 0,review,rating
count,2260,2260
unique,2257,3
top,Nice,positif
freq,2,1407


In [10]:
# Text preprocess
# 1. Convert Uppercase to Lowercase
# 2. Remove links
# 3. Remove Punctuations
# 4. Remove Number
# 5. Remove Stopwords
# 6. Stemming

In [11]:
import re
import string

In [12]:
#Convert uppercase to lowercase

In [13]:
data['review'] = data['review'].apply(lambda x:' '.join(x.lower() for x in x.split()))

In [14]:
data['review'].head()

0    up to this point i’ve mostly been using chatgp...
1    i’ve been a user since it’s initial roll out a...
2    this app would almost be perfect if it wasn’t ...
3    i recently downloaded the app and overall, it'...
4    i appreciate the devs implementing siri suppor...
Name: review, dtype: object

In [15]:
# Remove Links

In [16]:
data['review'] = data['review'].apply(lambda x:' '.join(re.sub(r'^https?:\/\/.*[\r\n]*','',x,flags=re.MULTILINE) for x in x.split()))

In [17]:
data['review'].head()

0    up to this point i’ve mostly been using chatgp...
1    i’ve been a user since it’s initial roll out a...
2    this app would almost be perfect if it wasn’t ...
3    i recently downloaded the app and overall, it'...
4    i appreciate the devs implementing siri suppor...
Name: review, dtype: object

In [18]:
# Remove Punctuation

In [19]:
def remove_punctuations(text):
    for punctuation in string.punctuation:
        text=text.replace(punctuation,'')
    return text
data['review'] = data['review'].apply(remove_punctuations)

In [20]:
data['review'].head()

0    up to this point i’ve mostly been using chatgp...
1    i’ve been a user since it’s initial roll out a...
2    this app would almost be perfect if it wasn’t ...
3    i recently downloaded the app and overall its ...
4    i appreciate the devs implementing siri suppor...
Name: review, dtype: object

In [21]:
# Remove Numbers

In [22]:
data['review']=data['review'].str.replace('\d+','', regex=True)

In [23]:
#Remove Stopwords

In [24]:
import nltk

In [25]:
nltk.download('stopwords', download_dir='C:/Users/HP/sentiment analysis')

[nltk_data] Downloading package stopwords to C:/Users/HP/sentiment
[nltk_data]     analysis...
[nltk_data]   Package stopwords is already up-to-date!


True

In [26]:
with open('C:/Users/HP/sentiment analysis/corpora/stopwords/english','r') as file:
    sw=file.read().splitlines()

In [27]:
sw

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [28]:
data['review'] = data['review'].apply(lambda x:' '.join(x for x in x.split() if x not in sw))

In [29]:
#Stemming

In [30]:
from nltk.stem import PorterStemmer
ps=PorterStemmer()

In [31]:
data['review'] = data['review'].apply(lambda x:' '.join(ps.stem(x) for x in x.split()))

In [32]:
data['review'].head()

0    point i’v mostli use chatgpt window desktop us...
1    i’v user sinc it’ initi roll wait mobil applic...
2    app would almost perfect wasn’t one littl thin...
3    recent download app overal great platform exce...
4    appreci dev implement siri support—it alreadi ...
Name: review, dtype: object

In [33]:
# Building Vocabulary

In [34]:
from collections import Counter
vocabulary=Counter()

In [35]:
for sentence in data['review']:
    vocabulary.update(sentence.split())

In [36]:
len(vocabulary)

5286

In [37]:
tokens = [key for key in vocabulary if vocabulary[key]>10]

In [38]:
len(tokens)

739

In [39]:
data

Unnamed: 0,review,rating
0,point i’v mostli use chatgpt window desktop us...,positif
1,i’v user sinc it’ initi roll wait mobil applic...,positif
2,app would almost perfect wasn’t one littl thin...,positif
3,recent download app overal great platform exce...,positif
4,appreci dev implement siri support—it alreadi ...,positif
...,...,...
2287,begin someth crazi let y’all know i’v sinc day...,positif
2288,i’v use chat proud premium subscrib awhil help...,positif
2289,chatgpt io app outstand product seamless funct...,positif
2290,sam altman’ blatant attempt regulatori captur ...,negatif


In [40]:
data.to_csv('C:/Users/HP/sentiment analysis/review_chatgpt_clean.csv', sep=',')