In [3]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [4]:
def change_directory(path):
    """Make ``path`` the process working directory and report the move.

    The working directory is printed once before and once after the
    switch so the change is visible in the cell output.

    Parameters
    ----------
    path : str or path-like
        Directory handed straight to ``os.chdir``.

    Returns
    -------
    None
    """
    previous_dir = os.getcwd()
    print("Current Working Directory ", previous_dir)

    os.chdir(path)

    # Re-query instead of echoing ``path`` so the printed value is what the OS reports.
    active_dir = os.getcwd()
    print("Changed Working Directory ", active_dir)

In [5]:
def read_data(file):
    """Read a CSV source into a pandas DataFrame.

    Parameters
    ----------
    file : str, path-like or file-like
        Anything ``pd.read_csv`` accepts; forwarded unchanged with
        default parsing options.

    Returns
    -------
    pandas.DataFrame
        The parsed table.
    """
    return pd.read_csv(file)

In [6]:
# Integer code assigned to each category name.
label = {
    'not_cyberbullying': 0,
    'gender': 1,
    'ethnicity': 2,
    'religion': 3,
}


def labelencode(x):
    """Translate a category name into its integer code.

    Parameters
    ----------
    x : str
        A key of the module-level ``label`` mapping.

    Returns
    -------
    int
        The code stored under ``x``.

    Raises
    ------
    KeyError
        If ``x`` is not a key of ``label``.
    """
    code = label[x]
    return code

In [7]:
def validation_train_test(data, validation_size=0.1, test_size=0.25,
                          random_state=101, stratify_col='cyberbullying_type'):
    """Split ``data`` into train, test and validation frames.

    The split happens in two stages. First ``validation_size`` of ``data``
    is held out as the validation set. Then ``test_size`` of the *remaining*
    rows becomes the test set. With the defaults this gives roughly
    10% validation, 22.5% test and 67.5% train of the original rows.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to split. Must contain ``stratify_col`` unless that is None.
    validation_size : float or int, default 0.1
        Passed as ``test_size`` to the first ``train_test_split`` call.
    test_size : float or int, default 0.25
        Passed as ``test_size`` to the second call, i.e. relative to the
        rows left after the validation hold-out.
    random_state : int or None, default 101
        Seed used for both splits.
    stratify_col : str or None, default 'cyberbullying_type'
        Column whose class proportions are preserved in every split.
        Pass None to split without stratification.

    Returns
    -------
    tuple
        ``(train, test, validation)``, in that order.
    """
    def _strata(frame):
        # Stratification is optional; None tells train_test_split to skip it.
        return None if stratify_col is None else frame[stratify_col]

    rest, validation = train_test_split(
        data,
        test_size=validation_size,
        random_state=random_state,
        stratify=_strata(data),
    )
    train, test = train_test_split(
        rest,
        test_size=test_size,
        random_state=random_state,
        stratify=_strata(rest),
    )
    return train, test, validation

In [8]:
def tfidf_vector(x_train, x_test, stop_words='english', max_df=0.7,
                 return_vectorizer=False):
    """Fit a TF-IDF vectorizer on ``x_train`` and transform both inputs.

    The vocabulary and IDF weights are learned from ``x_train`` only;
    ``x_test`` is transformed with that fitted vectorizer.

    Parameters
    ----------
    x_train, x_test : pandas.Series
        Text columns; each must expose ``.values``.
    stop_words : str, list or None, default 'english'
        Forwarded to ``TfidfVectorizer``.
    max_df : float or int, default 0.7
        Forwarded to ``TfidfVectorizer``.
    return_vectorizer : bool, default False
        When True, the fitted vectorizer is returned as a third element so
        further data (for example a validation split) can be transformed
        with the same vocabulary.

    Returns
    -------
    tuple
        ``(tfidf_train, tfidf_test)`` by default, or
        ``(tfidf_train, tfidf_test, vectorizer)`` when
        ``return_vectorizer`` is True.
    """
    tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words, max_df=max_df)

    # astype('U') coerces every entry to text so the vectorizer never sees a
    # non-string value; note that a missing value becomes the token 'nan'.
    tfidf_train = tfidf_vectorizer.fit_transform(x_train.values.astype('U'))
    tfidf_test = tfidf_vectorizer.transform(x_test.values.astype('U'))

    if return_vectorizer:
        return tfidf_train, tfidf_test, tfidf_vectorizer
    return tfidf_train, tfidf_test

In [9]:
# Switch into the raw-data folder; later file names are resolved against the working directory.
# NOTE(review): absolute, machine-specific path — this cell will fail on any other machine;
# consider deriving it from a configurable project root.
path = "C:\\Users\\ompra\\OneDrive\\Documents\\Machine Learning Projects\\Automated Decision Support System for Cyberbullying Detection - Version 2.0\\Data\\raw_data"
change_directory(path)

Current Working Directory  C:\Users\ompra\OneDrive\Documents\Machine Learning Projects\Automated Decision Support System for Cyberbullying Detection - Version 2.0\Notebooks
Changed Working Directory  C:\Users\ompra\OneDrive\Documents\Machine Learning Projects\Automated Decision Support System for Cyberbullying Detection - Version 2.0\Data\raw_data


In [10]:
# Load 'cleaned_data.csv'; the bare file name is resolved against the current working directory.
file = 'cleaned_data.csv'
data = read_data(file)
# Preview the first rows.
data.head()

Unnamed: 0,text,cyberbullying_type
0,long rantexplanation comment copeacefuls make ...,religion
1,toddmetcalf i 'm wondering conservative brothe...,not_cyberbullying
2,daveatherton bnsphrx hermannkelly robertsemons...,religion
3,rt trobinsonnewera nothing islam httptcoacqkux...,not_cyberbullying
4,stevesm mrsnickyclark billybragg thefamousartb...,gender


In [12]:
# Coerce the label column to unicode strings before the category names are looked up.
# NOTE(review): a missing label would become the string 'nan' here and then raise
# KeyError in labelencode — confirm the column has no nulls.
data['cyberbullying_type'] = data['cyberbullying_type'].values.astype('U')

In [13]:
# Replace each category name with its integer code from `label`.
# NOTE(review): the column is overwritten in place, so re-running only this cell after it
# has succeeded raises KeyError (the values are no longer category names).
data['cyberbullying_type'] = data['cyberbullying_type'].apply(labelencode)
# Preview the encoded labels.
data.head()

Unnamed: 0,text,cyberbullying_type
0,long rantexplanation comment copeacefuls make ...,3
1,toddmetcalf i 'm wondering conservative brothe...,0
2,daveatherton bnsphrx hermannkelly robertsemons...,3
3,rt trobinsonnewera nothing islam httptcoacqkux...,0
4,stevesm mrsnickyclark billybragg thefamousartb...,1


In [14]:
# (rows, columns) of the full frame before splitting.
data.shape

(131867, 2)

In [15]:
# Three-way split of the frame; note the return order is train, test, validation.
train, test, validation = validation_train_test(data)

In [16]:
# Show the (rows, columns) of each split, one per line: validation, test, train.
print(*(part.shape for part in (validation, test, train)), sep="\n")

(13187, 2)
(29670, 2)
(89010, 2)


In [17]:
# Switch into the processed-data folder; files written by bare name after this land there.
# NOTE(review): absolute, machine-specific path — this cell will fail on any other machine;
# consider deriving it from a configurable project root.
path = "C:\\Users\\ompra\\OneDrive\\Documents\\Machine Learning Projects\\Automated Decision Support System for Cyberbullying Detection - Version 2.0\\Data\\processed_data"
change_directory(path)

Current Working Directory  C:\Users\ompra\OneDrive\Documents\Machine Learning Projects\Automated Decision Support System for Cyberbullying Detection - Version 2.0\Data\raw_data
Changed Working Directory  C:\Users\ompra\OneDrive\Documents\Machine Learning Projects\Automated Decision Support System for Cyberbullying Detection - Version 2.0\Data\processed_data


In [18]:
# Write each split to its own CSV in the current working directory, without the index column.
for _csv_name, _split in (
    ('train.csv', train),
    ('test.csv', test),
    ('validation.csv', validation),
):
    _split.to_csv(_csv_name, index=False)

In [19]:
# (rows, columns) of the training split.
train.shape

(89010, 2)

In [None]:
# Pull the raw text column out of the train and test splits.
x_train, x_test = train['text'], test['text']

In [40]:
# Fit TF-IDF on the training text and transform both train and test text with it.
tfidf_train, tfidf_test = tfidf_vector(x_train,x_test)