<a href="https://colab.research.google.com/github/mazinkamal134/DS_MRP_2024/blob/main/TensiStrength/3_TensiStrength_Combine_%26_Finalize.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Please note:
- Use this notebook to combine the files generated by the previous step in this pipeline.
- Also use it to rescale the relax/stress score and add a combined score.

In [None]:
import pandas as pd
import numpy as np
import pickle
import re
import html
import os
from sklearn.preprocessing import MinMaxScaler

## Global Params

In [None]:
# Pipeline configuration for this run.
# Disorder whose chunk files are combined; alternatives listed by the author: "depression", "ptsd"
disorder = "anxiety"
# Number of chunk files to read (file suffixes _0 .. _{numberOfFiles - 1})
numberOfFiles = 5
# Directory that receives the final combined pickle
tensiStrenghtDir = "/TensiStrenght/"
# Directory holding the per-chunk pickles produced by the previous step
tensiStrenghtChunksDir = "/TensiStrenght/Chunks"

## Read the file chunks & combine

In [None]:
# Read the first chunk only to infer the dataframe schema (columns and dtypes).
# NOTE: pickle.load can execute arbitrary code; only open files produced by the
# previous step of this pipeline.
fileName = f"{disorder}TweetsDfWithTensiStrengthScore_0.pickle"
filePath = os.path.join(tensiStrenghtChunksDir, fileName)
with open(filePath, 'rb') as f:
    df = pickle.load(f)
# A zero-row slice keeps the columns and dtypes without first deep-copying every
# row of the chunk just to drop it again.
containerDf = df.iloc[0:0].copy()        # will hold the rows that received a score
toInvestigateDf = df.iloc[0:0].copy()    # will hold the rows without a score
del df  # free the chunk; it is read again by the combine step
print("Shape:", toInvestigateDf.shape)

In [None]:
# Loop through the chunk files, read each one and split scored from unscored rows.
# NOTE: pickle.load can execute arbitrary code; only open files produced by the
# previous step of this pipeline.
scoredParts = []
notScoredParts = []
for i in range(numberOfFiles):
    fileName = os.path.join(tensiStrenghtChunksDir, f"{disorder}TweetsDfWithTensiStrengthScore_{i}.pickle")
    print("Processing:", fileName)
    with open(fileName, 'rb') as f:
        fullDf = pickle.load(f)
    # Rows with a relax score were scored; the rest are set aside for investigation
    isScored = fullDf["relaxScore"].notna()
    scoredParts.append(fullDf[isScored])
    notScoredParts.append(fullDf[~isScored])
    print("Done processing:", fileName)

# Concatenate once instead of inside the loop: this avoids re-copying the
# accumulated frame on every iteration and keeps the cell idempotent (re-running
# it rebuilds the frames from the files rather than appending the same rows again).
# With numberOfFiles == 0 the existing frames are left untouched.
if scoredParts:
    containerDf = pd.concat(scoredParts)
    toInvestigateDf = pd.concat(notScoredParts)

print("Shape of scored:", containerDf.shape)
print("Shape of not scored:", toInvestigateDf.shape)
containerDf.sample()

## Finalize


*   Fix the data types
*   Adjust the relax and stress scores (change the scale to 0 - 4)
*   Add a combined score using the relax and stress scores



In [None]:
# Fix the data types: store both scores as 64-bit integers
containerDf = containerDf.astype({"relaxScore": "int64", "stressScore": "int64"})

# Create new zero-based columns from the relax/stress scores (vectorized):
# relax is shifted down by one, stress is shifted up by one and made non-negative
containerDf["relaxScoreAltered"] = containerDf["relaxScore"] - 1
containerDf["stressScoreAltered"] = (containerDf["stressScore"] + 1).abs()

# Combined score: min-max scale (relax + stress) to [0, 1], then invert it so the
# smallest sum maps to 1 and the largest sum maps to 0.
# NOTE(review): the scaler is fit on this dataset, so the result depends on the
# min/max sums actually observed here - confirm this is intended across disorders.
scaler = MinMaxScaler(feature_range=(0, 1))
summedScores = np.array(containerDf["relaxScore"] + containerDf["stressScore"]).reshape(-1, 1)
# fit_transform returns an (n, 1) array; flatten it before assigning to one column
containerDf["scaledScore"] = 1 - scaler.fit_transform(summedScores).ravel()

# Rename some of the columns
renamed = {
    "relaxScore": "relax_score_org",
    "stressScore": "stress_score_org",
    "scaledScore": "combined_score",
    "stressScoreAltered": "stress_score",
    "relaxScoreAltered": "relax_score",
}
containerDf = containerDf.rename(columns = renamed)

# Organize the columns. Each name appears exactly once: listing "group" twice
# would duplicate that column in the output frame.
cols = [
    "id", "tweet_type", "referenced_tweet_type", "created_at", "lang",
    "disorder", "group", "author_id", "text", "cleaned_text",
    "retweet_count", "reply_count", "like_count", "quote_count", "source",
    "relax_score_org", "stress_score_org", "relax_score", "stress_score",
    "combined_score",
]
containerDf = containerDf[cols]

# Check
containerDf.sample()

## Save

In [None]:
# Persist the combined, finalized dataframe for this disorder as a pickle
fileName = f"{disorder}FullWithTensiScore.pickle"
outputPath = os.path.join(tensiStrenghtDir, fileName)
containerDf.to_pickle(outputPath)

The next step is to use the final per-disorder files to update the master dataset in the main data pipeline.