In [1]:
# install missing packages
# %pip (rather than !pip) installs into the environment of the running kernel.
# NOTE(review): no versions are pinned, so a fresh run may resolve different
# releases than the ones this notebook was last executed with.
%pip install plotly
%pip install py-cpuinfo
%pip install pandas-datareader
%pip install nltk

Collecting plotly
  Obtaining dependency information for plotly from https://files.pythonhosted.org/packages/a8/07/72953cf70e3bd3a24cbc3e743e6f8539abe6e3e6d83c3c0c83426eaffd39/plotly-5.18.0-py3-none-any.whl.metadata
  Using cached plotly-5.18.0-py3-none-any.whl.metadata (7.0 kB)
Collecting tenacity>=6.2.0 (from plotly)
  Obtaining dependency information for tenacity>=6.2.0 from https://files.pythonhosted.org/packages/f4/f1/990741d5bb2487d529d20a433210ffa136a367751e454214013b441c4575/tenacity-8.2.3-py3-none-any.whl.metadata
  Using cached tenacity-8.2.3-py3-none-any.whl.metadata (1.0 kB)
Using cached plotly-5.18.0-py3-none-any.whl (15.6 MB)
Using cached tenacity-8.2.3-py3-none-any.whl (24 kB)
Installing collected packages: tenacity, plotly
Successfully installed plotly-5.18.0 tenacity-8.2.3
Note: you may need to restart the kernel to use updated packages.
Collecting py-cpuinfo
  Using cached py_cpuinfo-9.0.0-py3-none-any.whl (22 kB)
Installing collected packages: py-cpuinfo
Successfully

In [2]:
# check system details
import os
import psutil
import cpuinfo

# Print memory, CPU and disk figures for the machine running this notebook.
BYTES_PER_GIB = 1024 ** 3  # psutil reports sizes in bytes


def _gib(num_bytes):
    """Convert a byte count to units of 1024**3 bytes (printed as "GB")."""
    return num_bytes / BYTES_PER_GIB


ram_info = psutil.virtual_memory()
print(f"Total RAM: {_gib(ram_info.total):.2f} GB")
print(f"Available RAM: {_gib(ram_info.available):.2f} GB")
print(f"Used RAM: {_gib(ram_info.used):.2f} GB")
print(f"Percentage Usage Of RAM: {ram_info.percent}%")
print(f"CPU Cores: {os.cpu_count()}")
# py-cpuinfo leaves out fields it cannot determine on the current platform,
# so fall back to a placeholder instead of raising KeyError.
cpu_speed = cpuinfo.get_cpu_info().get("hz_actual_friendly", "unknown")
print(f"CPU Speed: {cpu_speed}")
# disk figures are for the filesystem that holds the current working directory
disk_info = psutil.disk_usage(os.getcwd())
print(f"Total Disk: {_gib(disk_info.total):.2f} GB")
print(f"Available Disk: {_gib(disk_info.free):.2f} GB")
print(f"Used Disk: {_gib(disk_info.used):.2f} GB")
print(f"Percentage Usage Of Disk: {disk_info.percent}%")

Total RAM: 15.47 GB
Available RAM: 14.07 GB
Used RAM: 1.14 GB
Percentage Usage Of RAM: 9.0%
CPU Cores: 4
CPU Speed: 2.5000 GHz
Total Disk: 24.99 GB
Available Disk: 18.69 GB
Used Disk: 6.30 GB
Percentage Usage Of Disk: 25.2%


In [3]:
# import requirements
import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix, accuracy_score, f1_score
from gbm_classifier import Classification

[nltk_data] Downloading package vader_lexicon to /home/studio-lab-
[nltk_data]     user/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
# read the three raw tables from CSV files in the working directory
mbti = pd.read_csv(filepath_or_buffer="mbti_labels.csv")
users = pd.read_csv(filepath_or_buffer="user_info.csv")
tweets = pd.read_csv(filepath_or_buffer="user_tweets.csv")

In [5]:
# remove unnecessary columns from users ("id" stays: it is the merge key)
unneeded_user_columns = ["id_str", "name", "screen_name", "location"]
users = users.drop(columns=unneeded_user_columns)

In [6]:
# # only keep the first tweets
# tweets = tweets.iloc[:, :2]

In [7]:
# # merge all the tweets together
# tweet = pd.DataFrame()
# for i in range(tweets.shape[0]):
#     Id = tweets["id"][i]
#     text = " ".join(tweets.iloc[i, 1:].astype(str).tolist())
#     tweet = pd.concat([
#         tweet, 
#         pd.DataFrame({"id": [Id], "tweet": [text]}),
#     ], axis="index").reset_index(drop=True)

In [8]:
# join the data together
# cast the key to str on both sides so the merge compares values of one type
mbti = mbti.assign(id=mbti["id"].astype(str))
users = users.assign(id=users["id"].astype(str))
# tweet["id"] = tweet["id"].astype(str)
# left join: every row of mbti is kept, with or without a match in users
personality = pd.merge(left=mbti, right=users, how="left", on="id")
# personality = personality.merge(right=tweet, how="left", on="id")

In [9]:
# replace every missing value with the literal string "None"
# NOTE(review): this applies to all columns, so a numeric column that had
# missing values would end up holding a mix of numbers and strings - confirm
# that is what the model expects.
personality = personality.fillna(value="None")

In [10]:
# the merge is done, so the id key column is no longer needed
personality = personality.drop(labels="id", axis="columns")

In [11]:
# make verified a binary variable: multiplying by 1 maps True/False to 1/0
verified_flag = personality["verified"] * 1
personality = personality.assign(verified=verified_flag)

In [12]:
# shuffle the data: sample all rows in a random order (fixed seed), then
# renumber the index from 0
shuffled = personality.sample(frac=1, random_state=0)
personality = shuffled.reset_index(drop=True)

In [13]:
# get the testing data: the last 20% of the rows, re-indexed from 0
# NOTE(review): X and y keep every row, so the test rows are also part of
# whatever is later fitted on X and y - confirm the model holds them out.
target = "mbti_personality"
y = personality[[target]]
X = personality.drop(columns=target)
n_test = int(0.2 * len(personality))  # X and y have the same row count
testX = X.tail(n_test).reset_index(drop=True)
testy = y.tail(n_test).reset_index(drop=True)

In [14]:
# build the model
print("\n---- Personality Classification Analysis ----\n")
model = Classification(
    name="XGBoost Without Feature Engineering", 
    path=None,
    rename=True, 
    time=False, 
    text=True,
    binary=False, 
    imputation=False, 
    variance=True,
    atwood=False,
    binning=False,
    reciprocal=False, 
    interaction=False, 
    selection=True,
    tune=True,
    plots=True,
)
# Try to reuse a previously saved pipeline; if loading or predicting with it
# fails, explore the data and build the pipeline from scratch.
try:
    model.load()  # load the machine learning pipeline
    predictions = model.predict(testX)
except Exception as error:
    # `except Exception` instead of a bare `except`: KeyboardInterrupt and
    # SystemExit now propagate, so interrupting the cell stops it rather than
    # silently starting a full retrain.
    print(f"Saved pipeline not used ({type(error).__name__}: {error})")
    model.explore(personality)
    model.validate(X, y)  # build the machine learning pipeline
    predictions = model.predict(testX)
    print("\nModel Performance:")
    print(f"Accuracy: {model.accuracy}")
    print(f"F1: {model.f1}")
    print(f"In Control: {model.in_control}")
    print("Confusion Matrix:")
    pd.set_option("display.width", 1000)
    # divide by the grand total so the cells are shares of all counted rows
    print(model.confusion / model.confusion.sum().sum())


---- Personality Classification Analysis ----

Visualizing The Data:
> Plotting Correlations
> total_retweet_count vs. average_retweet_count
> total_favorite_count vs. average_favorite_count
> total_hashtag_count vs. average_hashtag_count
> total_url_count vs. average_url_count
> total_mentions_count vs. average_mentions_count
> total_media_count vs. average_media_count
> Plotting followers_count
> Plotting friends_count
> Plotting listed_count
> Plotting favourites_count
> Plotting statuses_count
> Plotting number_of_quoted_statuses
> Plotting number_of_retweeted_statuses
> Plotting total_retweet_count
> Plotting total_favorite_count
> Plotting total_hashtag_count
> Plotting total_url_count
> Plotting total_mentions_count
> Plotting total_media_count
> Plotting number_of_tweets_scraped
> Plotting average_tweet_length
> Plotting average_retweet_count
> Plotting average_favorite_count
> Plotting average_hashtag_count
> Plotting average_url_count
> Plotting average_mentions_count
> Plot

In [15]:
# model diagnostics
print("Model Indicators:")
# first ten entries of the "Indicator" column, numbered from 1
top_indicators = model.indicators["Indicator"][:10].tolist()
for rank, indicator in enumerate(top_indicators, start=1):
    print(f"{rank}. {indicator}")
print(" ")
print("Feature Drift:")
# rows of model.drift whose "pvalue" is below 0.05; at most ten are listed
drifted = model.drift.loc[model.drift["pvalue"] < 0.05]
for rank, feature in enumerate(drifted["Feature"][:10].tolist(), start=1):
    print(f"{rank}. {feature}")
if len(drifted) == 0:
    print("None")

Model Indicators:
1. description_intj
2. description_infj
3. description_enfp
4. description_infp
5. description_enfj
6. description_entj
7. description_intp
8. description_entp
9. description_isfj
10. description_istj
 
Feature Drift:
1. average_media_count


In [16]:
# score the model on the held-out rows
actual = testy.iloc[:, 0].to_numpy()  # the single label column as a 1-D array
accuracy = accuracy_score(y_true=actual, y_pred=predictions)
# macro average: the unweighted mean of the per-class F1 scores
f1 = f1_score(y_true=actual, y_pred=predictions, average="macro")

print(f"Accuracy: {accuracy}")
print(f"F1: {f1}")

Accuracy: 0.8468468468468469
F1: 0.8315152346864694


In [17]:
# show the confusion matrix
ytest = testy.iloc[:, 0].to_numpy()
# sorted set of every class that occurs in the predictions or the true labels
labels = np.union1d(predictions, ytest)
confusion = pd.DataFrame(
    confusion_matrix(y_true=ytest, y_pred=predictions, labels=labels),
    index=labels,  # rows: true class
    columns=labels,  # columns: predicted class
)
print("Confusion Matrix:")
pd.set_option("display.width", 1000)
# divide by the grand total so each cell is a share of all test rows
total = confusion.to_numpy().sum()
print(confusion / total)

Confusion Matrix:
          enfj      enfp      entj      entp      esfj      esfp      estj      estp      infj      infp      intj      intp      isfj      isfp      istj      istp
enfj  0.078078  0.003604  0.001802  0.000601  0.000000  0.000000  0.000000  0.000000  0.001201  0.000000  0.000000  0.002402  0.000000  0.000000  0.000000  0.000601
enfp  0.002402  0.084084  0.000601  0.002402  0.000000  0.000000  0.000000  0.001201  0.002402  0.002402  0.000000  0.003604  0.000000  0.000000  0.000000  0.000000
entj  0.002402  0.001201  0.067868  0.002402  0.000000  0.000000  0.000000  0.000000  0.001802  0.000000  0.001201  0.003003  0.000000  0.000601  0.000000  0.000601
entp  0.003003  0.001802  0.001201  0.063664  0.000000  0.000601  0.000000  0.000000  0.001201  0.000601  0.000000  0.004204  0.000000  0.000000  0.001201  0.001201
esfj  0.001201  0.001201  0.000601  0.000000  0.025225  0.000601  0.000000  0.000000  0.001802  0.000000  0.000000  0.002402  0.000000  0.000000  0.000000  0

In [18]:
# save the machine learning pipeline
# NOTE(review): the model was created with path=None, so the save location is
# decided inside Classification - confirm where the files are written.
model.dump()

In [19]:
# refit the model to include the test data
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt
# after "Selecting Features", so the refit appears not to have finished in
# that run.
model.refit(testX, testy)

Model Retraining:
> Transforming The Updated Data
> Renaming Features
> Transforming Text Features
> Renaming Features
> Removing Constant Features
> Selecting Features


KeyboardInterrupt: 