## Import Dependencies

In [115]:
import json
import jsonlines  #for reading/writing the files
import re   #for cleaning
import nltk
from nltk.tokenize import word_tokenize  #for tokenization
from nltk.corpus import stopwords   #for stopwords removal
#from nltk.stem import WordNetLemmatizer 
import spacy  #for lemmatization
from keras.preprocessing.sequence import pad_sequences   #for padding sequences
from gensim.models import Word2Vec  #for word embedding
from sklearn.preprocessing import LabelEncoder  #for label encoding

## Path to Data files

In [3]:
# Path to the input JL files
train_ip_path = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/SMHDv1.1/SMHD_train.jl/train.jl'
test_ip_path = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/SMHDv1.1/SMHD_test.jl/test.jl'
dev_ip_path = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/SMHDv1.1/SMHD_dev.jl/dev.jl'

sample_input = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/sample1000.jl'
sample_output = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/sample1000text.jl'

# Path to the output JL files
train_op_path = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/trainText.jl'
test_op_path = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/testText.jl'
dev_op_path = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/devText.jl'

 # 1. Data Cleaning

## Keep only text and remove titles 

In [4]:
def keep_only_text(input_jl_file_path,output_jl_file_path) :
    with jsonlines.open(input_jl_file_path) as reader:
        # Create a list to store modified user data
        modified_users = []

        # Iterate through each line in the input JL file
        for user_data in reader:
            # Create a new dictionary for the user data without "text" section in each post
            modified_user_data = {
                "id": user_data["id"],
                "label": user_data["label"],
                "posts": []
            }

            # Iterate through each post in the user's data
            for post in user_data["posts"]:
                modified_post = {
                    "created_utc": post["created_utc"]
                }

                # Check if the post is a comment or an ownpost
                if "text" in post:
                    modified_post["text"] = post["text"]

                modified_user_data["posts"].append(modified_post)

            # Add the modified user data to the list
            modified_users.append(modified_user_data)

    # Write the modified user data to the output JL file
    with jsonlines.open(output_jl_file_path, mode='w') as writer:
        writer.write_all(modified_users)
    
    print("Data copied and modified successfully!")

### (try on sample data) 

In [4]:
keep_only_text(sample_input,sample_output)

Data copied and modified successfully!


### (a) train.jl 

In [14]:
keep_only_text(train_ip_path,train_op_path)

Data copied and modified successfully!


### (b) test.jl 

In [16]:
keep_only_text(test_ip_path,test_op_path)

Data copied and modified successfully!


### (c) dev.jl 

In [17]:
keep_only_text(dev_ip_path,dev_op_path)

Data copied and modified successfully!


## path to new modified files 

In [66]:
# Path to the output JL files
train_ip_text = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/trainText.jl'
test_ip_text = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/testText.jl'
dev_ip_text = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/devText.jl'

sample_file = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/sampleLemma1000.jl'

model_path = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/Model/Sampleword2vec.model'

## Load all the data to data frames 

#####  sample data 

In [3]:
sample_data = []
with open(sample_file, 'r', encoding='utf-8') as file:
    for line in file:
        sample_data.append(json.loads(line))

##### (a) train.jl 

In [33]:
train_data = []
with open(train_ip_text, 'r', encoding='utf-8') as file:
    for line in file:
        train_data.append(json.loads(line))

##### (b) test.jl 

In [8]:
test_data = []
with open(test_ip_text, 'r', encoding='utf-8') as file:
    for line in file:
        test_data.append(json.loads(line))

##### (c) dev.jl 

In [11]:
val_data = []
with open(dev_ip_text, 'r', encoding='utf-8') as file:
    for line in file:
        val_data.append(json.loads(line))

## Remove punctuations , mentions , hashtags , etc

In [7]:
def clean_text(text):
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Remove non-alphabetical characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)
    # Remove words starting with @ or #
    text = re.sub(r'[@#]\w+', '', text)
    # Remove links (http and https)
    text = re.sub(r'http[s]?://\S+', '', text)
    text = re.sub(r'http\w+', '', text)
    return text

In [8]:
def clean_data(data):
    for user_data in data:
        for post in user_data['posts']:
            if 'text' in post:
                post['text'] = clean_text(post['text'])

### (try on sample data) 

In [9]:
clean_data(sample_data)

In [10]:
sample_data[999]['posts'][5]

{'created_utc': 1390913029,
 'text': 'Unfortunately Charizard is pretty bad wo mega in addition to that you really need a rapid spinner other wise itll be hard to switch him in at all Switching your megastone from aggron to charizard or just ditching charizard would help You have a lot of fast but frail sweepersjolteon espeon greninja Id recommend switching Greninja to a bulky watertype like vaporeon milotic jellicent or if you insist on keeping charizard blastoisetentacruel for rapid spin support Right now your team is incredibly frail on the physical side bar Aggron and none of them can take any powerful special hit'}

### (a) train.jl 

In [36]:
clean_data(train_data)

In [50]:
train_data[0]['posts'][0]

{'created_utc': 1507313988,
 'text': 'Probably weed but maby kratom lsd or shrooms'}

### (b) test.jl 

In [9]:
clean_data(test_data)

In [10]:
test_data[0]['posts'][0]

{'created_utc': 1412904040,
 'text': 'For people who like Sandersons books its great Its right up there with the best hes written in my opinion If any of you havent read anything by Sanderson this is a good place to start '}

### (c) dev.jl 

In [12]:
clean_data(val_data)

In [13]:
val_data[0]['posts'][0]

{'created_utc': 1417511828,
 'text': 'Their posts are always fucking hilarious dunno why people find it so annoying not like youre going to find anything useful in the youtube comments anyway'}

# 2. Tokenization

In [11]:
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

In [12]:
def tokenize(data):
    for user_data in data:
        for post in user_data['posts']:
            if 'text' in post:
                post['text'] = word_tokenize(post['text'])

### (try on sample data) 

In [13]:
tokenize(sample_data)

In [14]:
sample_data[999]['posts'][4]

{'created_utc': 1389671029,
 'text': ['Added', 'really', 'need', 'that', 'larvesta', 'D']}

### (a) train.jl 

In [None]:
tokenize(train_data)

In [32]:
train_data[0]['posts'][0]

{'created_utc': 1507313988,
 'text': ['Probably', 'weed', 'but', 'maby', 'kratom', 'lsd', 'or', 'shrooms']}

### (b) test.jl 

In [None]:
tokenize(test_data)

### (c) dev.jl 

In [None]:
tokenize(val_data)

# 3. Removal of Stop words 

In [15]:
# Set up NLTK stop words
stop_words = set(stopwords.words('english'))

In [16]:
def handle_stop_words(tokens):
    filtered_tokens = [word for word in tokens if word.lower() not in stop_words]
    return filtered_tokens

In [17]:
def remove_stop_words(tokenized_data):
    for user_data in tokenized_data:
        for post in user_data['posts']:
            post['text'] = handle_stop_words(post['text'])

### (try on sample data) 

##### Before removal of stops 

In [14]:
sample_data[999]['posts'][4]

{'created_utc': 1389671029,
 'text': ['Added', 'really', 'need', 'that', 'larvesta', 'D']}

In [18]:
remove_stop_words(sample_data)

##### After removal of stop words 

In [19]:
sample_data[999]['posts'][4]

{'created_utc': 1389671029, 'text': ['Added', 'really', 'need', 'larvesta']}

### (a) train.jl 

In [13]:
remove_stop_words(train_data)

### (b) test.jl 

In [None]:
remove_stop_words(test_data)

### (c) dev.jl 

In [None]:
remove_stop_words(val_data)

# 4. Text Normalization

## Lemmatization 

In [21]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nikhil\AppData\Roaming\nltk_data...


True

In [39]:
#lemmatizer = WordNetLemmatizer()  #bad results, fast

In [38]:
nlp = spacy.load("en_core_web_sm")  #better results, slow

In [40]:
def normalize_text(text):
    doc = nlp(" ".join(text))
    normalized_tokens = [token.lemma_ for token in doc]
    return normalized_tokens

In [41]:
def normalize(tokenized_data):
    for user_data in tokenized_data:
        for post in user_data['posts']:
            post['text'] = normalize_text(post['text'])

##### Before Lemmatization 

In [36]:
sample_data[999]['posts'][1]

{'created_utc': 1372630153,
 'text': ['really', 'hoping', 'got', 'head', 'option', 'wear']}

In [42]:
normalize(sample_data)

##### After Lemmatization 

In [47]:
sample_data[999]['posts'][1]

{'created_utc': 1372630153,
 'text': ['really', 'hope', 'get', 'head', 'option', 'wear']}

In [49]:
new_file_path = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/sampleLemma1000.jl'

In [50]:
with open(new_file_path, 'w', encoding='utf-8') as new_file:
    for item in sample_data:
        new_file.write(json.dumps(item, ensure_ascii=False) + '\n')

# 5. Padding Sequences

In [13]:
word_index = {}

In [14]:
current_index = 1

In [15]:
for user_data in sample_data:
    for post in user_data['posts']:
        for word in post['text']:
            if word not in word_index:
                word_index[word] = current_index
                current_index += 1

In [41]:
max_sequence_length = 0

In [42]:
for user_data in sample_data:
    for post in user_data['posts']:
        num_words = len(post['text'])
        if num_words > max_sequence_length:
            max_sequence_length = num_words

In [43]:
print(max_sequence_length)

1490


In [44]:
padded_sequences = []

In [45]:
for user_data in sample_data:
    user_sequences = [post['text'] for post in user_data['posts']]
    
    # Convert text to integer sequences using the word-to-index mapping
    user_sequences = [[word_index.get(word, 0) for word in post] for post in user_sequences]
    
    # Pad integer sequences to the defined maximum length
    padded_user_sequences = pad_sequences(user_sequences, maxlen=max_sequence_length)
    
    padded_sequences.append(padded_user_sequences)

In [46]:
user_index = 999
post_index = 1

In [47]:
user_padded_sequence = padded_sequences[user_index][post_index]

In [48]:
print(f"User {user_index}, Post {post_index} Padded Sequence: {user_padded_sequence}")

User 999, Post 1 Padded Sequence: [   0    0    0 ...  138  349 2150]


In [52]:
file_path = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/Sample_padding_sequence.json'

In [57]:
padded_sequences = [np_array.tolist() for np_array in padded_sequences]

In [58]:
with open(file_path, 'w') as json_file:
    json.dump(padded_sequences, json_file)

# 6. Word Embeddings

In [61]:
reverse_word_index = {index: word for word, index in word_index.items()}

In [62]:
sentences = []

In [63]:
for user_sequences in padded_sequences:
    user_sentences = []
    for sequence in user_sequences:
        words = [reverse_word_index.get(word_index, '') for word_index in sequence]
        # Filter out empty strings (words that were not in the vocabulary)
        words = [word for word in words if word]
        user_sentences.extend(words)
    sentences.append(user_sentences)

In [64]:
embedding_dim = 100

In [65]:
word2vec_model = Word2Vec(sentences=sentences, vector_size=embedding_dim, window=5, min_count=1, sg=0)

In [67]:
word2vec_model.save(model_path)

In [73]:
word_vector = word2vec_model.wv['kill']

In [78]:
similar_words = word2vec_model.wv.most_similar('kill', topn=5)

In [80]:
print("Word Vector:", word_vector)

Word Vector: [-0.7624603   0.5667327  -0.11790708  0.4045255   1.6707231  -0.93970895
 -1.1593304  -0.34688106  0.9544053  -0.43721533  0.7584189  -0.47916892
 -0.59194493 -0.2209195  -0.06403993 -0.4531901   2.1844497  -1.5598427
 -1.6020402  -1.0974357   1.4175241   0.9730563   0.0496596   0.22842279
 -1.2254587   2.339027   -1.1140791   1.06785    -1.8779273  -0.03106171
 -0.40430018 -2.5592191   1.9469364  -0.97656447 -1.5017256   0.5677096
  1.0405535  -1.5106094  -2.3751135  -1.7575663  -2.8695605  -1.225546
 -1.7039794   0.0600432   0.36928493  0.59420407  0.69251025 -1.2156699
  0.8187086   0.682567    0.9484843  -0.44280767  0.42827117 -0.35138822
  1.1773863   0.7543728  -0.9010412  -0.21494429 -1.5918437  -0.33065182
  1.5333589  -2.2780867  -0.68316287  0.27214384  0.09200618  0.43474734
 -1.8963821  -0.4905794  -0.02900872  1.759124   -1.4844135  -0.13969484
  2.1848798  -1.441671   -0.9977914  -0.55889696 -1.8839258  -1.2839987
 -0.0566964  -2.023728   -0.9764239  -0.5779

In [81]:
print("Similar Words:", similar_words)

Similar Words: [('destroy', 0.7499843835830688), ('attack', 0.7377498745918274), ('chase', 0.7375485301017761), ('strike', 0.737365186214447), ('fight', 0.7349938750267029)]


# 7. Label Encoding

In [109]:
all_labels = set()

In [110]:
for user_data in sample_data:
    user_labels = user_data.get('label', [])  
    # Extend the set with the user's labels
    all_labels.update(user_labels)

In [111]:
all_labels_list = list(all_labels)

In [112]:
print(all_labels_list)

['anxiety', 'autism', 'ptsd', 'adhd', 'bipolar', 'eating', 'depression', 'ocd', 'control']


In [116]:
label_encoder = LabelEncoder()

In [117]:
target_labels = all_labels_list

In [118]:
encoded_labels = label_encoder.fit_transform(target_labels)

In [126]:
encoded_labels[3]

0

In [127]:
# Get the mapping between original labels and their encoded values
label_mapping = dict(zip(target_labels, encoded_labels))

In [128]:
print(label_mapping)

{'anxiety': 1, 'autism': 2, 'ptsd': 8, 'adhd': 0, 'bipolar': 3, 'eating': 6, 'depression': 5, 'ocd': 7, 'control': 4}


In [129]:
file_path = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/Sample_label_encoding.json'

In [132]:
label_mappings = {label: int(encoded_label) for label, encoded_label in zip(target_labels, label_encoder.transform(target_labels))}

In [133]:
with open(file_path, 'w') as json_file:
    json.dump(label_mappings, json_file)

In [134]:
new_file_path = 'C:/Users/nikhil/PESU/Capstone/Data/SMHD/sampleLabEnc1000.jl'

In [135]:
with open(new_file_path, 'w', encoding='utf-8') as new_file:
    for item in sample_data:
        new_file.write(json.dumps(item, ensure_ascii=False) + '\n')