In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns

## Data Review and Preparation

In [2]:
# Load the survey responses; the file is decoded as Latin-1 instead of pandas' default UTF-8.
raw_csv_path = 'data/Sheet_1.csv'
data = pd.read_csv(raw_csv_path, encoding="latin1")

In [3]:
# Display the frame exactly as loaded, including the empty "Unnamed" columns.
data

Unnamed: 0,response_id,class,response_text,Unnamed: 3,Unnamed: 4,Unnamed: 5,Unnamed: 6,Unnamed: 7
0,response_1,not_flagged,I try and avoid this sort of conflict,,,,,
1,response_2,flagged,Had a friend open up to me about his mental ad...,,,,,
2,response_3,flagged,I saved a girl from suicide once. She was goin...,,,,,
3,response_4,not_flagged,i cant think of one really...i think i may hav...,,,,,
4,response_5,not_flagged,Only really one friend who doesn't fit into th...,,,,,
...,...,...,...,...,...,...,...,...
75,response_76,not_flagged,"Now that I've been through it, although i'm no...",,,,,
76,response_77,flagged,when my best friends mom past away from od'ing...,,,,,
77,response_78,not_flagged,As a camp counselor I provide stability in kid...,,,,,
78,response_79,flagged,My now girlfriend used to have serious addicti...,,,,,


In [4]:
# Keep only the label and the free-text answer, then discard rows where either is missing.
# .copy() makes `data` an independent frame so later column assignments do not act on a slice.
data = data[["class", "response_text"]].dropna(axis=0).copy()
data

Unnamed: 0,class,response_text
0,not_flagged,I try and avoid this sort of conflict
1,flagged,Had a friend open up to me about his mental ad...
2,flagged,I saved a girl from suicide once. She was goin...
3,not_flagged,i cant think of one really...i think i may hav...
4,not_flagged,Only really one friend who doesn't fit into th...
...,...,...
75,not_flagged,"Now that I've been through it, although i'm no..."
76,flagged,when my best friends mom past away from od'ing...
77,not_flagged,As a camp counselor I provide stability in kid...
78,flagged,My now girlfriend used to have serious addicti...


I will map the values in the `class` column to numbers, because the model needs numeric labels during training.

In [5]:
# Encode the text labels as integers, since the model needs numeric targets:
#   not_flagged -> 0
#   flagged     -> 1
class_codes = {'not_flagged': 0, 'flagged': 1}

# Only map while the column still holds the text labels. Re-running this cell on an
# already-encoded column would look up 0/1 in the dict, find nothing, and silently
# replace every label with NaN.
if not pd.api.types.is_numeric_dtype(data['class']):
    data['class'] = data['class'].map(class_codes)
data

Unnamed: 0,class,response_text
0,0,I try and avoid this sort of conflict
1,1,Had a friend open up to me about his mental ad...
2,1,I saved a girl from suicide once. She was goin...
3,0,i cant think of one really...i think i may hav...
4,0,Only really one friend who doesn't fit into th...
...,...,...
75,0,"Now that I've been through it, although i'm no..."
76,1,when my best friends mom past away from od'ing...
77,0,As a camp counselor I provide stability in kid...
78,1,My now girlfriend used to have serious addicti...


In [6]:
# Summary of the prepared frame: index range, per-column non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 80 entries, 0 to 79
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   class          80 non-null     int64 
 1   response_text  80 non-null     object
dtypes: int64(1), object(1)
memory usage: 1.9+ KB


In [7]:
# Column names left in the dataset after keeping only the label and text columns
data.columns

Index(['class', 'response_text'], dtype='object')

## Natural Language Processing (NLP)
### What is Natural Language Processing (NLP)?
#### Natural language processing (NLP) is a branch of artificial intelligence (AI) that enables computers to comprehend, generate, and manipulate human language.

In [8]:
# Preview the first ten rows (encoded label + response text) before any text processing
data.head(10)

Unnamed: 0,class,response_text
0,0,I try and avoid this sort of conflict
1,1,Had a friend open up to me about his mental ad...
2,1,I saved a girl from suicide once. She was goin...
3,0,i cant think of one really...i think i may hav...
4,0,Only really one friend who doesn't fit into th...
5,0,a couple of years ago my friends was going to ...
6,1,Roommate when he was going through death and l...
7,1,i've had a couple of friends (you could say mo...
8,0,Listened to someone talk about relationship tr...
9,1,I will always listen. I comforted my sister wh...


### Regular expression operations 
#### In this section, I will remove non-letter characters (e.g. /, *, -, ', ") using Python's `re` module.

This is the fifth entry (index 4) of the `response_text` column.

In [9]:
# Look up the response stored under index label 4 (used as the worked example below)
data["response_text"].loc[4]

'Only really one friend who doesn\'t fit into the any of the above categories. Her therapist calls it spiraling." Anyway she pretty much calls me any time she is frustrated by something with  her boyfriend to ask me if it\'s logical or not. Before they would just fight and he would call her crazy. Now she asks me if it\'s ok he didn\'t say "please" when he said  "hand me the remote."'

### Cleaning Data

In [10]:
import re

# Work on a single response first (index label 4) to demonstrate the cleaning steps.
first_Res_text = data.response_text[4]

# Replace every character that is not an ASCII letter with a space, then lower-case the result.
Res_text = re.sub("[^a-zA-Z]", " ", first_Res_text).lower()

#### This is how the sample text looks after cleaning.

In [11]:
# Cleaned sample: only letters and spaces remain, all lower-case
Res_text

'only really one friend who doesn t fit into the any of the above categories  her therapist calls it spiraling   anyway she pretty much calls me any time she is frustrated by something with  her boyfriend to ask me if it s logical or not  before they would just fight and he would call her crazy  now she asks me if it s ok he didn t say  please  when he said   hand me the remote  '

#### NLTK 
##### In this section, I will split the sentence into words (tokens) and remove English stop words.

In [12]:
import nltk
from nltk.corpus import stopwords

# NOTE(review): word_tokenize and the stop-word corpus need their own NLTK data packages;
# this notebook only downloads "wordnet" and "omw-1.4" — confirm they are installed.

# Build the stop-word set once. Constructing it inside the comprehension would reload
# the stop-word list and rebuild the set for every single token.
english_stop_words = set(stopwords.words("english"))

# Split the cleaned sentence into tokens, then keep only the tokens that are not stop words.
Res_text = nltk.word_tokenize(Res_text)
Res_text = [word for word in Res_text if word not in english_stop_words]

Words in the sentence.

In [13]:
# Tokens of the sample response that survived stop-word removal
Res_text

['really',
 'one',
 'friend',
 'fit',
 'categories',
 'therapist',
 'calls',
 'spiraling',
 'anyway',
 'pretty',
 'much',
 'calls',
 'time',
 'frustrated',
 'something',
 'boyfriend',
 'ask',
 'logical',
 'would',
 'fight',
 'would',
 'call',
 'crazy',
 'asks',
 'ok',
 'say',
 'please',
 'said',
 'hand',
 'remote']

### Lemmatization 
##### In this section, I will reduce the words obtained above to their base (dictionary) forms.

In [14]:
# Fetch the WordNet data used by the WordNet lemmatizer in the following cells.
nltk.download("wordnet")
# NOTE(review): "omw-1.4" is presumably needed alongside wordnet by the installed NLTK
# version — confirm. The return value of this last call is what the cell displays.
nltk.download("omw-1.4")

[nltk_data] Downloading package wordnet to /home/nicholas/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/nicholas/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [15]:
import nltk as nlp

# WordNet-based lemmatizer
lemma = nlp.WordNetLemmatizer()

# Lemmatize each token and rebuild a single space-separated string.
Res_text = " ".join(lemma.lemmatize(word) for word in Res_text)

The words have been reduced to their base forms.

In [16]:
# Sample response after lemmatization, joined back into one string
Res_text

'really one friend fit category therapist call spiraling anyway pretty much call time frustrated something boyfriend ask logical would fight would call crazy asks ok say please said hand remote'

#### Words 
###### In this section, I will apply the steps above to the entire data set and collect the cleaned version of each response in a list.

In [17]:
def clean_response(text, stop_words, lemmatizer):
    """Normalize one free-text response for the bag-of-words step.

    Non-letter characters are replaced by spaces, the text is lower-cased and
    tokenized, tokens found in ``stop_words`` are dropped, and the remaining
    tokens are lemmatized.

    Parameters
    ----------
    text : str
        Raw response text.
    stop_words : collection of str
        Tokens to discard (membership is tested with ``in``).
    lemmatizer : object
        Anything exposing ``lemmatize(word)``.

    Returns
    -------
    str
        The surviving lemmas joined by single spaces.
    """
    letters_only = re.sub("[^a-zA-Z]", " ", text).lower()
    tokens = nltk.word_tokenize(letters_only)
    return " ".join(
        lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
    )


# Loop-invariant work done once: previously the stop-word set was rebuilt for every
# token and a new lemmatizer was created for every response.
english_stop_words = set(stopwords.words("english"))
lemma = nlp.WordNetLemmatizer()

# One cleaned string per response, in the same order as data.response_text
description_list = [
    clean_response(description, english_stop_words, lemma)
    for description in data.response_text
]

In [18]:
# Spot check: the fifth cleaned response (list position 4)
description_list[4]

'really one friend fit category therapist call spiraling anyway pretty much call time frustrated something boyfriend ask logical would fight would call crazy asks ok say please said hand remote'

#### Bag of Words 
##### I will find the most frequently used words in the list.

In [19]:
from sklearn.feature_extraction.text import CountVectorizer

# Upper bound on the number of vocabulary terms the vectorizer keeps
max_feature = 500

# Bag-of-words vectorizer using scikit-learn's built-in English stop-word list
cv = CountVectorizer(stop_words="english", max_features=max_feature)

# Learn the vocabulary and count terms per response, then densify the sparse
# result into the feature matrix (x).
term_counts = cv.fit_transform(description_list)
space_matrix = term_counts.toarray()

feature_names = cv.get_feature_names_out()
print("Most frequently used {} words {}".format(max_feature, feature_names))

Most frequently used 500 words ['able' 'absolutely' 'acquaintance' 'acted' 'action' 'activity'
 'addiction' 'adequate' 'admit' 'advice' 'advise' 'age' 'ago' 'agony'
 'alcoholic' 'allowed' 'anniversary' 'answer' 'anxiety' 'anxious'
 'apposed' 'ask' 'asks' 'attention' 'aunt' 'avoid' 'away' 'bad'
 'basically' 'bedroom' 'best' 'better' 'big' 'bit' 'blow' 'blue' 'blunt'
 'book' 'boyfriend' 'break' 'bring' 'brother' 'brought' 'bunch' 'called'
 'calling' 'calm' 'came' 'camp' 'camping' 'campsite' 'cancer' 'car' 'care'
 'caring' 'category' 'caught' 'cause' 'chance' 'change' 'changed' 'chat'
 'circumstance' 'clean' 'cleaning' 'cocaine' 'come' 'comfort' 'comforted'
 'commit' 'common' 'complete' 'completely' 'concern' 'confines' 'conflict'
 'convinced' 'cop' 'cope' 'counselor' 'countless' 'couple' 'crazy'
 'cutter' 'cutting' 'damn' 'dating' 'day' 'dealing' 'dealt' 'death'
 'defined' 'depressed' 'depression' 'describes' 'desire' 'diagnosed'
 'dialog' 'difficulty' 'disorder' 'doc' 'dont' 'douche' 'd