In [3]:
# PERFORM TEXT VECTORIZATION...

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import re

# LOAD THE SURVEY RESPONSES
# The CSV location is kept in its own variable so the read call stays short.
csv_path = r"C:\Users\nh013\Desktop\cyber bulling dataset\data.csv"
df = pd.read_csv(csv_path)

# FUNCTION FOR PREPROCESSING
def preprocess_text(text):
    """Normalise a single cell value into lowercase alphanumeric text.

    Parameters
    ----------
    text : object
        One value taken from a text column (a string, a missing value, or
        any other scalar).

    Returns
    -------
    str
        * For strings: URLs removed, text lowercased, and every character
          that is not an ASCII letter or digit replaced by a space.
        * For missing values (``None`` or a float NaN): the empty string.
        * For anything else: ``str(text)``, returned without further cleaning.
    """
    if isinstance(text, str):
        # REMOVE URLS
        # The match is case-insensitive because it runs before lowercasing;
        # otherwise "HTTP://..." or "WWW...." would survive as junk tokens.
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.IGNORECASE)

        # CONVERT TO LOWER CASE
        text = text.lower()

        # REMOVE SPECIAL CHARACTERS (each one becomes a single space)
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)

        return text

    # Missing values contribute no text at all.
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ""

    return str(text)

# SELECT THE TEXT COLUMNS TO PROCESS
# NOTE(review): "Email Address" is personal data and ends up both in the
# TF-IDF vocabulary and in the printed preview below - confirm it belongs here.
text_columns = [
    "Email Address",
    "What do you consider as Cyber bullying?",
    "If Yes , What was the way you dealt with the same",
    "What do you think made you the victim of cyber bullying?",
    "What Platform according to you leads in cyber bullying",
    "What type of people are usually the bullies on the internet?",
    "What lead you to bully someone ? ( if yes )",
    "What kind of people are usually the victims of cyber-bullying?",
    "What are the causes of cyber bullying according to you",
    "Who are bullies usually?",
    "Youtube Roasting is a form of bullying. Do you agree ?",
]

# PREPROCESS TEXT
for column in text_columns:
    df[column] = df[column].apply(preprocess_text)

# TEXT VECTORIZATION
# Each row becomes one document: all of its cleaned answers joined by spaces.
vectorizer = TfidfVectorizer()
text_data = df[text_columns].apply(lambda x: ' '.join(x), axis=1)
vectorized_text = vectorizer.fit_transform(text_data)

# GET FEATURE NAMES
# vocabulary_ is a {term: column index} dict; iterating it does NOT follow the
# column order of the TF-IDF matrix, so using it as column labels attaches the
# wrong word to each column. get_feature_names_out() returns the terms in
# column order.
feature_names = vectorizer.get_feature_names_out()

# CONVERT THE VECTORIZED TEXT TO A DATA FRAME
# Reuse df's index so the concat below aligns row-for-row.
vectorized_df = pd.DataFrame(
    vectorized_text.toarray(), columns=feature_names, index=df.index
)

# CONCATENATE THE VECTORIZED DATAFRAME WITH THE ORIGINAL DATAFRAME
df = pd.concat([df, vectorized_df], axis=1)

print(df.head())


            Timestamp               Email Address What's Your Age?  \
0  1/22/2022 11:39:31      armadhav2007 gmail com          15 - 18   
1  1/22/2022 11:40:39  nikhilkumar38674 gmail com         Above 18   
2  1/22/2022 11:41:15      guptasayyam7 gmail com          15 - 18   
3  1/22/2022 11:41:57      sonaissuhani gmail com          15 - 18   
4  1/22/2022 11:42:02        madmaxhu69 gmail com          15 - 18   

             What do you consider as Cyber bullying?  \
0  hacking   phishing your private details  spamm...   
1  hacking   phishing your private details  spamm...   
2  spamming   unlawfully morphing your images  me...   
3  mean inappropriate comments  pranks calls by s...   
4  mean inappropriate comments  spamming   unlawf...   

  What Age group of people are most Vulnerable to Cyber bullying  \
0                                              19-30               
1                                 equally vulnerable               
2                                     

In [6]:
# TRAIN AN RNN (LSTM) MODEL TO PREDICT: What kind of people are usually the victims of cyber-bullying?

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder


# SEED THE RANDOM NUMBER GENERATORS
# np.random.seed only seeds NumPy. Python's `random` module and TensorFlow
# (which Keras uses for weight initialisation) keep their own generators and
# must be seeded separately for runs to be repeatable.
import random

import tensorflow as tf

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# DATASET
df = pd.read_csv(r"C:\Users\nh013\Desktop\cyber bulling dataset\data.csv")


# FUNCTION FOR PREPROCESSING

def preprocess_text(text):
    """Clean one cell value so it can be tokenised.

    Parameters
    ----------
    text : object
        A single value from a text column.

    Returns
    -------
    str
        Strings come back with URLs stripped, lowercased, and with every
        character other than an ASCII letter or digit replaced by a space.
        ``None`` and float NaN come back as ``""``. Any other value comes
        back as ``str(text)`` with no further cleaning.
    """
    if isinstance(text, str):

        # REMOVE URLS
        # Case-insensitive on purpose: this step runs before lowercasing, so
        # without the flag "HTTPS://..." or "Www...." would not be removed.
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.IGNORECASE)

        # CONVERT TO LOWER CASE
        text = text.lower()

        # REMOVE SPECIAL CHARACTERS (replaced one-for-one by spaces)
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)

        return text

    # Treat missing values as "no text".
    if text is None or (isinstance(text, float) and np.isnan(text)):
        return ""

    return str(text)

    
    
    
# SELECT THE TEXT COLUMNS TO PROCESS

text_columns = [
    "Email Address",
    "What do you consider as Cyber bullying?",
    "If Yes , What was the way you dealt with the same",
    "What do you think made you the victim of cyber bullying?",
    "What Platform according to you leads in cyber bullying",
    "What type of people are usually the bullies on the internet?",
    "What lead you to bully someone ? ( if yes )",
    "What kind of people are usually the victims of cyber-bullying?",
    "What are the causes of cyber bullying according to you",
    "Who are bullies usually?",
    "Youtube Roasting is a form of bullying. Do you agree ?",
]

# Column the model learns to predict.
target_column = "What kind of people are usually the victims of cyber-bullying?"

# PREPROCESS TEXT

for column in text_columns:
    df[column] = df[column].apply(preprocess_text)

# BUILD FEATURES AND TARGET
# The target column must not be part of the input text; otherwise the model
# simply reads the answer it is asked to predict and the reported accuracy is
# meaningless.
feature_columns = [column for column in text_columns if column != target_column]
X = df[feature_columns].apply(lambda x: ' '.join(x), axis=1)
y = df[target_column]

# SPLIT THE DATA INTO TRAINING AND TESTING SETS
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TOKENIZE THE TEXT DATA
# The vocabulary is learned from the training rows only.

tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train_tokens = tokenizer.texts_to_sequences(X_train)
X_test_tokens = tokenizer.texts_to_sequences(X_test)

# PAD SEQUENCES TO HAVE THE SAME LENGTH

max_sequence_length = max(len(seq) for seq in X_train_tokens)
X_train_pad = pad_sequences(X_train_tokens, maxlen=max_sequence_length)
X_test_pad = pad_sequences(X_test_tokens, maxlen=max_sequence_length)

# CONVERT TARGET VARIABLE TO INTEGER
# The encoder is fitted on every label so that an answer which only occurs in
# the test split does not make transform() raise "previously unseen labels".

label_encoder = LabelEncoder()
label_encoder.fit(y)
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
num_classes = len(label_encoder.classes_)

# ONE HOT ENCODING OF THE TARGET VARIABLE
y_train_encoded = to_categorical(y_train_encoded, num_classes=num_classes)
y_test_encoded = to_categorical(y_test_encoded, num_classes=num_classes)

# RNN MODEL
embedding_dim = 100  # Dimensionality of the word embeddings
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is the padding value

model = Sequential()
model.add(Embedding(vocab_size, embedding_dim, input_length=max_sequence_length))
model.add(LSTM(100))
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# TRAIN THE MODEL
# NOTE(review): the test split doubles as validation data here; that is fine
# while nothing is tuned on it, but use a separate validation split if early
# stopping or model selection is added.
model.fit(X_train_pad, y_train_encoded, validation_data=(X_test_pad, y_test_encoded), epochs=100, batch_size=32)

# EVALUATE
loss, accuracy = model.evaluate(X_test_pad, y_test_encoded, verbose=0)
print(f'Test Loss: {loss:.4f}')
print(f'Test Accuracy: {accuracy:.4f}')


Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

In [19]:
# TRAIN RNN (LSTM) MODELS TO PREDICT MULTIPLE TARGET VARIABLES

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# SEED THE RANDOM NUMBER GENERATORS
# Seeding NumPy alone leaves Python's `random` module and TensorFlow (used by
# Keras for weight initialisation) unseeded, so they are seeded here as well.
import random

import tensorflow as tf

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# DATASET
df = pd.read_csv(r"C:\Users\nh013\Desktop\cyber bulling dataset\data.csv")


# FUNCTION


def preprocess_text(text):
    """Clean a string, or every element of a pandas Series, for tokenisation.

    Parameters
    ----------
    text : str, pandas.Series or other scalar
        A single value, or a whole column whose elements are cleaned one by one.

    Returns
    -------
    str or pandas.Series
        * Series in -> Series out, each element cleaned by this function.
        * String in -> URLs removed, lowercased, and every character that is
          not an ASCII letter or digit replaced by a space.
        * ``None`` or float NaN -> ``""``.
        * Any other value (e.g. a number) -> ``str(text)``, so it is kept
          rather than silently replaced by an empty string.
    """
    if isinstance(text, pd.Series):
        # Clean the column element by element.
        return text.apply(preprocess_text)

    if isinstance(text, str):
        # REMOVE URLS
        # Case-insensitive because this runs before lowercasing; without the
        # flag "HTTP://..." and "WWW...." would be left in the text.
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.IGNORECASE)

        # CONVERT TO LOWER CASE
        text = text.lower()

        # REMOVE SPECIAL CHARACTERS
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)

        return text

    # Missing values carry no text.
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return ""

    return str(text)


# SELECT THE TEXT COLUMNS TO PROCESS
text_columns = [
    "Email Address",
    "What do you consider as Cyber bullying?",
    "If Yes , What was the way you dealt with the same",
    "What do you think made you the victim of cyber bullying?",
    "What Platform according to you leads in cyber bullying",
    "What type of people are usually the bullies on the internet?",
    "What lead you to bully someone ? ( if yes )",
    "What kind of people are usually the victims of cyber-bullying?",
    "What are the causes of cyber bullying according to you",
    "Who are bullies usually?",
    "Youtube Roasting is a form of bullying. Do you agree ?",
]

# PREPROCESS THE TEXT
for column in text_columns:
    df[column] = preprocess_text(df[column])


# FULL TEXT PER ROW (builds the shared vocabulary) AND THE TARGET COLUMNS
X = df[text_columns].apply(lambda x: ' '.join(x), axis=1)
y = df[text_columns]

# TOKENIZE THE DATA
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

# PAD THE INPUT SEQUENCES
max_input_sequence_length = 100
X_pad = pad_sequences(X_seq, maxlen=max_input_sequence_length)

# PER-TARGET INPUTS
# Each model must not see the column it predicts; feeding it the full row text
# (X_pad) would hand it the answer. For every target, the input is therefore
# rebuilt from the remaining columns only.
X_pad_by_target = {}
for column in text_columns:
    other_columns = [other for other in text_columns if other != column]
    other_text = df[other_columns].apply(lambda x: ' '.join(x), axis=1)
    X_pad_by_target[column] = pad_sequences(
        tokenizer.texts_to_sequences(other_text),
        maxlen=max_input_sequence_length,
    )

# HOLD OUT A TEST SET
# The same row positions are held out for every target so that the reported
# test metrics come from rows no model was trained on.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)


# ONE HOT ENCODING OF THE TARGET VARIABLES


label_encoders = {}
y_pad = []
for column in text_columns:
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y[column])
    y_pad.append(to_categorical(y_encoded))
    label_encoders[column] = label_encoder



# ONE RNN MODEL PER TARGET COLUMN


vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is the padding value
embedding_dim = 100  # Adjust as needed

models = []
for i, column in enumerate(text_columns):
    num_classes = len(label_encoders[column].classes_)
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_input_sequence_length))
    model.add(LSTM(100))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    models.append(model)



# TRAIN THE MODELS (training rows only)


for i, model in enumerate(models):
    column = text_columns[i]
    print(f"Training model for column: {column}")
    model.fit(
        X_pad_by_target[column][train_idx],
        y_pad[i][train_idx],
        validation_split=0.2,
        epochs=1000,
        batch_size=32,
    )

# EVALUATE (held-out rows only)


for i, model in enumerate(models):
    column = text_columns[i]
    loss, accuracy = model.evaluate(
        X_pad_by_target[column][test_idx], y_pad[i][test_idx], verbose=0
    )
    print(f'Test Loss for column {text_columns[i]}: {loss:.4f}')
    print(f'Test Accuracy for column {text_columns[i]}: {accuracy:.4f}')


Training model for column: Email Address
Epoch 1/1000
Epoch 2/1000
Epoch 3/1000
Epoch 4/1000
Epoch 5/1000
Epoch 6/1000
Epoch 7/1000
Epoch 8/1000
Epoch 9/1000
Epoch 10/1000
Epoch 11/1000
Epoch 12/1000
Epoch 13/1000
Epoch 14/1000
Epoch 15/1000
Epoch 16/1000
Epoch 17/1000
Epoch 18/1000
Epoch 19/1000
Epoch 20/1000
Epoch 21/1000
Epoch 22/1000
Epoch 23/1000
Epoch 24/1000
Epoch 25/1000
Epoch 26/1000
Epoch 27/1000
Epoch 28/1000
Epoch 29/1000
Epoch 30/1000
Epoch 31/1000
Epoch 32/1000
Epoch 33/1000
Epoch 34/1000
Epoch 35/1000
Epoch 36/1000
Epoch 37/1000
Epoch 38/1000
Epoch 39/1000
Epoch 40/1000
Epoch 41/1000
Epoch 42/1000
Epoch 43/1000
Epoch 44/1000
Epoch 45/1000
Epoch 46/1000
Epoch 47/1000
Epoch 48/1000
Epoch 49/1000
Epoch 50/1000
Epoch 51/1000
Epoch 52/1000
Epoch 53/1000
Epoch 54/1000
Epoch 55/1000
Epoch 56/1000
Epoch 57/1000
Epoch 58/1000
Epoch 59/1000
Epoch 60/1000
Epoch 61/1000
Epoch 62/1000
Epoch 63/1000
Epoch 64/1000
Epoch 65/1000
Epoch 66/1000
Epoch 67/1000
Epoch 68/1000
Epoch 69/1000
Ep

In [21]:
# INTERACT WITH THE TRAINED MODELS VIA USER INPUT

import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from sklearn.model_selection import train_test_split
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from sklearn.preprocessing import LabelEncoder

# Set seeds for reproducibility.
# NumPy's seed does not cover Python's `random` module or TensorFlow (which
# Keras uses for weight initialisation), so each generator is seeded here.
import random

import tensorflow as tf

SEED = 42
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)

# DATASET
df = pd.read_csv(r"C:\Users\nh013\Desktop\cyber bulling dataset\data.csv")

def preprocess_text(text):
    """Clean user input, a single cell value, or a whole pandas Series.

    Parameters
    ----------
    text : str, pandas.Series or other scalar
        The value to clean. A Series is processed element by element.

    Returns
    -------
    str or pandas.Series
        * Series -> a Series of cleaned elements.
        * String -> URLs removed, lowercased, and every character other than
          an ASCII letter or digit replaced by a space.
        * ``None`` or float NaN -> ``""``.
        * Any other value -> ``str(text)``; such values are kept instead of
          being silently turned into empty strings.
    """
    if isinstance(text, pd.Series):
        # Apply the same cleaning to each element of the Series.
        return text.apply(preprocess_text)

    if isinstance(text, str):
        # REMOVE URLS
        # The flag makes the match case-insensitive; the removal runs before
        # lowercasing, so "HTTP://..." would otherwise slip through.
        text = re.sub(r"http\S+|www\S+|https\S+", "", text, flags=re.IGNORECASE)

        # CONVERT TO LOWER CASE
        text = text.lower()

        # REMOVE SPECIAL CHARACTERS
        text = re.sub(r"[^a-zA-Z0-9]", " ", text)

        return text

    # Missing values become empty text.
    if text is None or (isinstance(text, float) and pd.isna(text)):
        return ""

    return str(text)

# SELECT THE TEXT COLUMNS TO PROCESS
text_columns = [
    "Email Address",
    "What do you consider as Cyber bullying?",
    "If Yes , What was the way you dealt with the same",
    "What do you think made you the victim of cyber bullying?",
    "What Platform according to you leads in cyber bullying",
    "What type of people are usually the bullies on the internet?",
    "What lead you to bully someone ? ( if yes )",
    "What kind of people are usually the victims of cyber-bullying?",
    "What are the causes of cyber bullying according to you",
    "Who are bullies usually?",
    "Youtube Roasting is a form of bullying. Do you agree ?",
]

# PREPROCESS THE TEXT
for column in text_columns:
    df[column] = preprocess_text(df[column])

# FULL TEXT PER ROW (builds the shared vocabulary) AND THE TARGET COLUMNS
X = df[text_columns].apply(lambda x: ' '.join(x), axis=1)
y = df[text_columns]

# TOKENIZE THE DATA
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_seq = tokenizer.texts_to_sequences(X)

# PAD THE INPUT SEQUENCES
max_input_sequence_length = 100  # Adjust as needed
X_pad = pad_sequences(X_seq, maxlen=max_input_sequence_length)

# PER-TARGET INPUTS
# A model trained on X_pad would see the very column it has to predict. The
# input for each target is therefore rebuilt from the other columns only.
X_pad_by_target = {}
for column in text_columns:
    other_columns = [other for other in text_columns if other != column]
    other_text = df[other_columns].apply(lambda x: ' '.join(x), axis=1)
    X_pad_by_target[column] = pad_sequences(
        tokenizer.texts_to_sequences(other_text),
        maxlen=max_input_sequence_length,
    )

# HOLD OUT A TEST SET
# The same row positions are held out for every target, so the metrics printed
# below are measured on rows that were not used for training.
train_idx, test_idx = train_test_split(
    np.arange(len(df)), test_size=0.2, random_state=42
)

# ONE HOT ENCODING FOR THE TARGET VARIABLES
label_encoders = {}
y_pad = []
for column in text_columns:
    label_encoder = LabelEncoder()
    y_encoded = label_encoder.fit_transform(y[column])
    y_pad.append(to_categorical(y_encoded))
    label_encoders[column] = label_encoder

# ONE RNN MODEL PER TARGET COLUMN
vocab_size = len(tokenizer.word_index) + 1  # +1 because index 0 is the padding value
embedding_dim = 100  # Adjust as needed

models = []
for i, column in enumerate(text_columns):
    num_classes = len(label_encoders[column].classes_)
    model = Sequential()
    model.add(Embedding(vocab_size, embedding_dim, input_length=max_input_sequence_length))
    model.add(LSTM(100))
    model.add(Dense(num_classes, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    models.append(model)

# TRAIN THE MODELS (training rows only)
for i, model in enumerate(models):
    column = text_columns[i]
    print(f"Training model for column: {column}")
    model.fit(
        X_pad_by_target[column][train_idx],
        y_pad[i][train_idx],
        validation_split=0.2,
        epochs=100,
        batch_size=32,
    )

# EVALUATE THE MODELS (held-out rows only)
for i, model in enumerate(models):
    column = text_columns[i]
    loss, accuracy = model.evaluate(
        X_pad_by_target[column][test_idx], y_pad[i][test_idx], verbose=0
    )
    print(f'Test Loss for column {text_columns[i]}: {loss:.4f}')
    print(f'Test Accuracy for column {text_columns[i]}: {accuracy:.4f}')

# FUNCTION TO GET PREDICTION FROM MODEL
def get_predictions(text):
    """Run every per-column model on one piece of text.

    The text is cleaned with ``preprocess_text``, converted to a token
    sequence with the module-level ``tokenizer`` and padded to
    ``max_input_sequence_length``. Each model in ``models`` then predicts on
    that padded sequence, and the highest-scoring class index is mapped back
    to its label with the matching encoder from ``label_encoders``.

    Parameters
    ----------
    text : str
        The text to classify.

    Returns
    -------
    dict
        Maps each name in ``text_columns`` to the predicted label for it.
    """
    cleaned = preprocess_text(text)

    # Tokenise and pad in one step; the tokenizer expects a list of texts.
    padded = pad_sequences(
        tokenizer.texts_to_sequences([cleaned]),
        maxlen=max_input_sequence_length,
    )

    results = {}
    for idx in range(len(text_columns)):
        name = text_columns[idx]
        scores = models[idx].predict(padded)
        best_class = np.argmax(scores, axis=1)
        # inverse_transform returns an array; the batch holds a single text.
        results[name] = label_encoders[name].inverse_transform(best_class)[0]

    return results

# INTERACTIVE LOOP: READ USER INPUT AND PRINT THE PREDICTIONS
# Type "exit" (any case, surrounding spaces ignored) to stop.
while True:
    try:
        user_input = input("Enter your text: ")
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or the user interrupted: stop without a traceback.
        break

    command = user_input.strip()
    if command.lower() == "exit":
        break
    if not command:
        # Nothing to classify; ask again instead of predicting on empty text.
        continue

    predictions = get_predictions(user_input)
    print("Predictions:")
    for column, predicted_class in predictions.items():
        print(f"{column}: {predicted_class}")


Training model for column: Email Address
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 