### Obtaining and Loading Data

In [1]:
import pandas as pd

# Read the dataset from a path relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='../data/mental_health.csv')

# Preview the first rows to confirm the file parsed as expected.
df.head()


Unnamed: 0,text,label
0,dear american teens question dutch person hear...,0
1,nothing look forward lifei dont many reasons k...,1
2,music recommendations im looking expand playli...,0
3,im done trying feel betterthe reason im still ...,1
4,worried year old girl subject domestic physic...,1


### Exploratory Data Analysis

In [2]:
# Column dtypes and non-null counts, to spot missing values.
df.info()

# Class balance of the target column.
df["label"].value_counts()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27977 entries, 0 to 27976
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    27977 non-null  object
 1   label   27977 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 437.3+ KB


0    14139
1    13838
Name: label, dtype: int64

### Model Fitting

In [3]:
from sklearn.model_selection import train_test_split

# Features are the text column; the target is the label column.
X, y = df["text"], df["label"]

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=17,
)


In [4]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# TF-IDF features feed a multinomial Naive Bayes classifier; both steps use
# the library defaults. Bundling them in one Pipeline means the vectorizer is
# fitted on the training texts only and reused as-is at prediction time.
pipe = Pipeline(
    steps=[
        ('tfidf', TfidfVectorizer()),
        ('model', MultinomialNB()),
    ]
)
pipe.fit(X_train, y_train)



In [5]:
from sklearn import metrics

# Score the fitted pipeline on the held-out split.
predictions = pipe.predict(X_test)
accuracy = metrics.accuracy_score(y_test, predictions)
print(accuracy)

# Rows are true labels, columns are predicted labels (scikit-learn convention).
confusion = metrics.confusion_matrix(y_test, predictions)
confusion

0.8398856325947105


array([[1969,  862],
       [  34, 2731]])

In [6]:
# Spot-check the fitted pipeline on a single query string.
# NOTE(review): this text matches the beginning of a row shown in the
# df.head() preview (labelled 1), so it is a sanity check on data the
# notebook already contains, not evidence of generalization to new text.
q = "nothing look forward lifei dont many reasons"
# predict expects an iterable of documents, hence the one-element list.
a = pipe.predict([q])
a

array([1])