In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
import warnings
warnings.filterwarnings('ignore')

# Load the dataset (mypersonality_final.csv) into a DataFrame.
# NOTE(review): this is a hard-coded absolute path (looks like a Colab working
# directory - confirm); the notebook will not run elsewhere without editing it.
file_path = '/content/mypersonality_final.csv'
# The file is decoded as ISO-8859-1 (Latin-1) instead of pandas' default UTF-8.
df = pd.read_csv(file_path, encoding='ISO-8859-1')

# Display the first few rows of the dataset to understand its structure
print(df.head())

                            #AUTHID  \
0  b7b7764cfa1c523e4e93ab2a79a946c4   
1  b7b7764cfa1c523e4e93ab2a79a946c4   
2  b7b7764cfa1c523e4e93ab2a79a946c4   
3  b7b7764cfa1c523e4e93ab2a79a946c4   
4  b7b7764cfa1c523e4e93ab2a79a946c4   

                                              STATUS  sEXT  sNEU  sAGR  sCON  \
0                        likes the sound of thunder.  2.65   3.0  3.15  3.25   
1  is so sleepy it's not even funny that's she ca...  2.65   3.0  3.15  3.25   
2  is sore and wants the knot of muscles at the b...  2.65   3.0  3.15  3.25   
3         likes how the day sounds in this new song.  2.65   3.0  3.15  3.25   
4                                        is home. <3  2.65   3.0  3.15  3.25   

   sOPN cEXT cNEU cAGR cCON cOPN               DATE  NETWORKSIZE  BETWEENNESS  \
0   4.4    n    y    n    n    y  06/19/09 03:21 PM        180.0      14861.6   
1   4.4    n    y    n    n    y   07-02-2009 08:41        180.0      14861.6   
2   4.4    n    y    n    n    y  06/15/0

In [None]:
# Count the missing values in every column before deciding how to handle them
print(df.isnull().sum())

#AUTHID         4919
STATUS          4919
sEXT            4919
sNEU            4919
sAGR            4919
sCON            4919
sOPN            4919
cEXT            4919
cNEU            4919
cAGR            4919
cCON            4919
cOPN            4919
DATE            4919
NETWORKSIZE     4919
BETWEENNESS     4919
NBETWEENNESS    4919
DENSITY         4919
BROKERAGE       4919
NBROKERAGE      4919
TRANSITIVITY    4919
dtype: int64


In [None]:
# Drop rows where 'STATUS' column is null.
# .copy() makes the filtered result an independent frame, so the column
# assignments below modify `df` itself rather than a possible view of the
# pre-filter frame. Without it pandas can emit SettingWithCopyWarning, which
# would go unnoticed here because warnings are globally ignored at the top of
# the notebook.
df = df.dropna(subset=['STATUS']).copy()

# Fill missing values in the personality trait columns with the median value
# (the median is computed per column, on the rows that survived the drop above)
for col in ['sOPN', 'sCON', 'sEXT', 'sAGR', 'sNEU']:
    df[col] = df[col].fillna(df[col].median())

# Optional: If you want to replace the null 'STATUS' with an empty string instead of dropping
# df['STATUS'] = df['STATUS'].fillna('')

# Verify there are no more missing values
print(df.isnull().sum())


#AUTHID         0
STATUS          0
sEXT            0
sNEU            0
sAGR            0
sCON            0
sOPN            0
cEXT            0
cNEU            0
cAGR            0
cCON            0
cOPN            0
DATE            0
NETWORKSIZE     0
BETWEENNESS     0
NBETWEENNESS    0
DENSITY         0
BROKERAGE       0
NBROKERAGE      0
TRANSITIVITY    0
dtype: int64


In [None]:

# Download the stopwords from NLTK and keep the English list as a set, so the
# per-word membership test inside clean_text is a hash lookup rather than a
# list scan.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# Function to clean text
def clean_text(text, stopword_set=None):
    """Normalise a raw status string into lowercase, stop-word-free text.

    Steps, in order: lowercase, strip URLs, replace punctuation (including
    underscores) with spaces, drop digits, collapse whitespace, remove stop
    words.

    Args:
        text: The raw string to clean.
        stopword_set: Optional collection of lowercase words to discard. When
            omitted, the notebook-level ``stop_words`` set is used.

    Returns:
        The remaining words joined by single spaces (possibly an empty string).
    """
    if stopword_set is None:
        stopword_set = stop_words
    # Lowercase first so URL matching also catches 'HTTP://...' / 'WWW....'
    text = text.lower()
    # Remove URLs. The \b anchors stop the pattern from firing inside ordinary
    # words: without them 'www\S+' turns e.g. 'awwww' into 'a'.
    text = re.sub(r'\bhttp\S+|\bwww\.\S+', '', text)
    # Remove punctuation; '_' is listed explicitly because \W treats it as a
    # word character and would leave it in place
    text = re.sub(r'[\W_]', ' ', text)
    # Remove numbers
    text = re.sub(r'\d+', '', text)
    # Collapse runs of whitespace and trim the ends
    text = re.sub(r'\s+', ' ', text).strip()
    # Remove stop words
    return ' '.join(word for word in text.split() if word not in stopword_set)

# Build the 'cleaned_status' column by passing every status (coerced to str)
# through clean_text
df['cleaned_status'] = df['STATUS'].map(lambda status: clean_text(str(status)))

# Show the raw and cleaned text side by side for the first rows
df[['STATUS', 'cleaned_status']].head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Unnamed: 0,STATUS,cleaned_status
0,likes the sound of thunder.,likes sound thunder
1,is so sleepy it's not even funny that's she ca...,sleepy even funny get sleep
2,is sore and wants the knot of muscles at the b...,sore wants knot muscles base neck stop hurting...
3,likes how the day sounds in this new song.,likes day sounds new song
4,is home. <3,home


In [None]:
!pip install keras tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer

# Initialize the tokenizer with a limit on the number of words;
# '<OOV>' is the token substituted for words outside that vocabulary
tokenizer = Tokenizer(num_words=10000, oov_token='<OOV>')

# Fit the tokenizer on the cleaned text
# NOTE(review): the vocabulary is built from every row of df, i.e. before any
# train/test split, so held-out text influences the word index - confirm this
# is acceptable for the evaluation.
tokenizer.fit_on_texts(df['cleaned_status'])

# Convert text to sequences (integer encoding), one list of word ids per status
sequences = tokenizer.texts_to_sequences(df['cleaned_status'])

# Display the first few tokenized sequences
print(sequences[:5])

[[762, 684, 763], [441, 56, 283, 8, 40], [510, 45, 4034, 2590, 1856, 1411, 143, 2591, 511, 78, 1857], [762, 3, 866, 12, 340], [24]]


In [None]:
from keras.preprocessing.sequence import pad_sequences

# Pad sequences to ensure uniform length
max_length = 100  # Define max length for padding
# padding='post' appends zeros after the tokens. `truncating` is left at its
# Keras default ('pre'), so a sequence longer than max_length would lose its
# leading tokens - NOTE(review): confirm that is intended.
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Display the padded sequences
print(padded_sequences[:5])


[[ 762  684  763    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0]
 [ 441   56  283    8   40    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0    0    0    0    0
     0    0    0    0    0    0    0    0    0    0

In [None]:
from sklearn.model_selection import train_test_split

# Requires padded_sequences (the padded token-id matrix) to be defined already.
# Personality trait score columns to model, one target at a time
traits = ['sEXT', 'sNEU', 'sAGR', 'sCON', 'sOPN']

# Store results for each trait
# NOTE(review): nothing below writes to this dict yet; it stays empty.
results = {}

# Loop through each trait
for trait in traits:
    # Define the target variable for the current trait
    y = df[trait]

    # Split the data into training and test sets.
    # The fixed random_state gives every trait the same row partition; the
    # X_train/X_test/y_train/y_test globals are overwritten on each iteration,
    # so only the last trait's split remains after the loop.
    X_train, X_test, y_train, y_test = train_test_split(padded_sequences, y, test_size=0.2, random_state=42)

    # Check the shape of the split datasets
    print(f"{trait} - Training set shape: {X_train.shape}, Test set shape: {X_test.shape}, y_train shape: {y_train.shape}, y_test shape: {y_test.shape}")

    # Placeholder: add the model training code for each trait here.
    # For example:
    # model.fit(X_train, y_train)
    # y_pred = model.predict(X_test)
    # You can calculate and store metrics here

    # You might also want to store the results in the results dictionary
    # results[trait] = {'accuracy': accuracy, 'precision': precision, 'recall': recall, 'f1_score': f1_score}


sEXT - Training set shape: (3998, 100), Test set shape: (1000, 100), y_train shape: (3998,), y_test shape: (1000,)
sNEU - Training set shape: (3998, 100), Test set shape: (1000, 100), y_train shape: (3998,), y_test shape: (1000,)
sAGR - Training set shape: (3998, 100), Test set shape: (1000, 100), y_train shape: (3998,), y_test shape: (1000,)
sCON - Training set shape: (3998, 100), Test set shape: (1000, 100), y_train shape: (3998,), y_test shape: (1000,)
sOPN - Training set shape: (3998, 100), Test set shape: (1000, 100), y_train shape: (3998,), y_test shape: (1000,)


In [None]:
!pip install catboost

Collecting catboost
  Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.7-cp310-cp310-manylinux2014_x86_64.whl (98.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.7/98.7 MB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.7


In [None]:
pip install tensorflow




In [None]:
!pip install keras




In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb
from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

# Binarisation threshold applied to every trait score: score > threshold -> class 1.
# NOTE(review): one cut-off of 4 for all five traits gives very skewed classes
# for some of them (the previously recorded sNEU accuracies of ~0.99 suggest
# nearly every row lands in one class), so raw accuracy must be read against
# the majority-class baseline printed below. Consider per-trait thresholds or
# the dataset's own cEXT/cNEU/cAGR/cCON/cOPN columns - TODO confirm.
threshold = 4  # Adjust this threshold for your data

# List of personality traits to classify
traits = ['sEXT', 'sNEU', 'sAGR', 'sOPN', 'sCON']

# Initialize classifiers. Every model gets a fixed seed (previously only
# RandomForest had one) so a re-run reproduces the same scores.
classifiers = {
    'AdaBoost': AdaBoostClassifier(n_estimators=100, random_state=42),
    'GradientBoosting': GradientBoostingClassifier(n_estimators=100, random_state=42),
    'XGBoost': xgb.XGBClassifier(n_estimators=100, random_state=42),
    'CatBoost': CatBoostClassifier(iterations=100, silent=True, random_seed=42),
    'RandomForest': RandomForestClassifier(n_estimators=10, random_state=42)
}

# Split data into train/test sets (all trait columns are carried along as y)
# NOTE(review): rows are split at random, and the sample output shows several
# statuses sharing one #AUTHID with identical scores - if authors end up in
# both train and test, accuracy may be optimistic. Confirm whether a
# group-wise split by author is needed.
X_train, X_test, y_train, y_test = train_test_split(df['cleaned_status'], df[traits], test_size=0.2, random_state=42)

# Initialize CountVectorizer and TfidfVectorizer
count_vectorizer = CountVectorizer(max_features=10000)
tfidf_vectorizer = TfidfVectorizer(max_features=10000)

# Fit on the training text only, then transform both splits (TF and TF-IDF)
X_train_tf = count_vectorizer.fit_transform(X_train).toarray()
X_test_tf = count_vectorizer.transform(X_test).toarray()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train).toarray()
X_test_tfidf = tfidf_vectorizer.transform(X_test).toarray()

# Combine TF and TF-IDF features. The features do not depend on the trait, so
# the (large, dense) stacked matrices are built once instead of once per trait.
X_train_combined = np.hstack((X_train_tf, X_train_tfidf))
X_test_combined = np.hstack((X_test_tf, X_test_tfidf))

# Per-classifier list of accuracies, one entry per trait
overall_accuracies = {}

for trait in traits:
    # Convert the target variable to binary classes (discrete)
    y_train_binary = (y_train[trait] > threshold).astype(int)
    y_test_binary = (y_test[trait] > threshold).astype(int)

    # Accuracy of always predicting the more frequent test class; a model is
    # only informative if it beats this number
    positive_rate = y_test_binary.mean()
    baseline_accuracy = max(positive_rate, 1 - positive_rate)

    # Evaluate classifiers for each trait (fit() retrains each model from scratch)
    print(f"--- Evaluating classifiers for {trait} ---")
    print(f"Majority-class baseline for {trait}: {baseline_accuracy:.4f}")
    for name, classifier_model in classifiers.items():
        classifier_model.fit(X_train_combined, y_train_binary)
        accuracy = classifier_model.score(X_test_combined, y_test_binary)
        print(f"{name} Accuracy (TF + TF-IDF) for {trait}: {accuracy:.4f}")

        # Accumulate overall accuracy
        overall_accuracies.setdefault(name, []).append(accuracy)

# Calculate and print overall accuracy for each classifier (mean over traits)
print("\n--- Overall Accuracy ---")
for name, accuracies in overall_accuracies.items():
    avg_accuracy = np.mean(accuracies)
    print(f"{name} Overall Accuracy (average): {avg_accuracy:.4f}")


--- Evaluating classifiers for sEXT ---
AdaBoost Accuracy (TF + TF-IDF) for sEXT: 0.6920
GradientBoosting Accuracy (TF + TF-IDF) for sEXT: 0.7240
XGBoost Accuracy (TF + TF-IDF) for sEXT: 0.7210
CatBoost Accuracy (TF + TF-IDF) for sEXT: 0.7220
RandomForest Accuracy (TF + TF-IDF) for sEXT: 0.7090
--- Evaluating classifiers for sNEU ---
AdaBoost Accuracy (TF + TF-IDF) for sNEU: 0.9860
GradientBoosting Accuracy (TF + TF-IDF) for sNEU: 0.9800
XGBoost Accuracy (TF + TF-IDF) for sNEU: 0.9880
CatBoost Accuracy (TF + TF-IDF) for sNEU: 0.9920
RandomForest Accuracy (TF + TF-IDF) for sNEU: 0.9900
--- Evaluating classifiers for sAGR ---
AdaBoost Accuracy (TF + TF-IDF) for sAGR: 0.7310
GradientBoosting Accuracy (TF + TF-IDF) for sAGR: 0.7550
XGBoost Accuracy (TF + TF-IDF) for sAGR: 0.7440
CatBoost Accuracy (TF + TF-IDF) for sAGR: 0.7590
RandomForest Accuracy (TF + TF-IDF) for sAGR: 0.7580
--- Evaluating classifiers for sOPN ---
AdaBoost Accuracy (TF + TF-IDF) for sOPN: 0.7050
GradientBoosting Accura