In [1]:
import numpy as np
import pandas as pd
import spacy
import nltk 
import requests
import json
import googletrans
import random

from random import randint
from nltk.stem import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
from spacy.lang.en import English
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from nltk.corpus import wordnet 
from googletrans import Translator

# Load the small English spaCy model.
# NOTE(review): the returned pipeline is discarded here; confirm whether this
# call is still needed or whether its result should be kept.
spacy.load('en_core_web_sm')

# NLTK resources used by this notebook: the English stop-word list, and the
# WordNet corpus that WordNetLemmatizer reads when lemmatizing. Without the
# second download a machine that has never fetched WordNet raises LookupError
# on the first lemmatize() call.
nltk.download('stopwords')
nltk.download('wordnet')
en_stop = set(nltk.corpus.stopwords.words('english'))

# Shared, module-level NLP helpers reused by the functions defined further down.
analyser = SentimentIntensityAnalyzer()  # VADER sentiment scorer
parser = English()                       # blank spaCy English pipeline, used for tokenizing
porter = PorterStemmer()                 # only referenced by commented-out stemming code
translator = Translator()                # googletrans client

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ninagroot/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Both paths are relative, so the workbooks must sit in the notebook's
# working directory for this cell to run.
file = 'edit-db_model1.xlsx'

# Two sheets are read from the main workbook; the exercises live in their own file.
df_gratitude = pd.read_excel(file, sheet_name='gratitude')
df_exercises = pd.read_excel('exercises_model1.xlsx')
df_journal = pd.read_excel(file, sheet_name='sc_journal')

# Keyword lists per topic. Each entry is later normalised by prepare_topics(),
# which also removes duplicates, so repeated words here are harmless.
# Misspelled keywords can never match real text, so 'acquaintance' and
# 'comrade' are spelled correctly below.
# NOTE(review): 'boldness' under 'body insecurity' may have been meant as
# 'baldness' - confirm with the author before changing it.
topics = {
    'social media': ['social', 'media', 'network', 'twitter', 'facebook', 'instagram', 'web', 'linkedin', 'google', 'viral', 'google+', 'event', 'website', 'reach', 'telegraph', 'telephone', 'blog', 'internet', 'social', 'sociality', 'technoself', 'intranet', 'cyber', 'business', 'sociable', 'activist', 'microblogging', 'socially', 'cybernetic', 'pinterest', 'youtube', 'tumblr', 'cyberspace', 'extranet', 'telnet', 'cybernetics', 'modem', 'communication', 'informatics', 'virtual', 'party', 'picture'],
    'body insecurity': ['body', 'insecure', 'belly', 'skin', 'waist', 'hair', 'eyes', 'lips', 'ears', 'forehead', 'eyebrow', 'chin', 'scar', 'nose', 'look', 'eat', 'ate', 'boldness', 'beard', 'smile', 'teeth', 'stomach', 'back', 'ass', 'butt', 'frame', 'feature', 'physique', 'bod', 'bodies', 'torso', 'arm', 'head', 'corpse', 'chest', 'shape', 'shoulder', 'form', 'neck', 'structure', 'leg', 'mass', 'weight', 'shaky', 'unattractive', 'unsure', 'anxious', 'jealous', 'unconfident', 'distrustful', 'uncomfortable', 'unsecured', 'neurotic', 'hopeless', 'depressed', 'introverted', 'helpless', 'fragile', 'danger', 'anxiety', 'mistrust', 'instability', 'unrest', 'hardship', 'vulnerability', 'lawlessness', 'uncertainty', 'turmoil', 'strife', 'fragility', 'hopelessness', 'malnutrition', 'alienation', 'violence', 'poverty', 'tension', 'shortages', 'despair', 'chaos', 'distrust', 'resentment', 'underdevelopment', 'anarchy', 'paranoia', 'cynicism', 'fear', 'scared', 'insecure', 'fearful'],
    'social anxiety': ['fear', 'worry', 'stress', 'nervousness', 'attention', 'shame', 'panic', 'social', 'attack', 'own', 'activities', 'lonely', 'alone', 'rejection', 'public', 'speaking', 'nausea', 'stutter', 'acting', 'performance', 'stage', 'fright', 'fear', 'public', 'shyness', 'blushing', 'anxiety', 'self-consciousness', 'panic', 'illness', 'mood disorder', 'timidness', 'social functioning', 'parties', 'outgoing'],
    'friendship' : ['best', 'friend', 'classmate', 'schoolmate', 'roomie', 'boyfriend', 'sister', 'amigo', 'acquaintance', 'brother', 'comrade', 'girlfriend', 'person', 'pal', 'schoolfriend', 'mate', 'colleague', 'roommate', 'love', 'pal', 'buddy', 'talk', 'soulmate', 'fight', 'trust', 'understanding', 'happiness', 'relations', 'empathy', 'company', 'feelings', 'unity', 'friendship', 'friend', 'companionship', 'relationship', 'affection']
    }

FileNotFoundError: [Errno 2] No such file or directory: 'edit-db_model1.xlsx'

In [None]:
# Keep only journal rows that have an id.
df_journal = df_journal.dropna(subset=['id'])

# Work on an independent copy: a plain assignment would only alias df_journal,
# so the helper columns added below would also appear in df_journal (and in
# anything later exported from it).
df_journal_lang = df_journal.copy()

# Word count per description, based on whitespace splitting.
df_journal_lang['description_split'] = df_journal_lang['description'].str.split()
df_journal_lang['length'] = df_journal_lang['description_split'].str.len()

# Retain entries with strictly more than 20 words. Rows whose description is
# missing have a NaN length and are therefore dropped as well. The trailing
# copy() detaches the result so later column assignments do not act on a slice.
df_journal_lang = df_journal_lang[df_journal_lang['length'] > 20].copy()
display(df_journal_lang)

In [None]:
# Render the journal DataFrame so its current rows and columns can be inspected.
display(df_journal)

In [None]:
def tokenize(text):
    """Split ``text`` into normalised tokens using the module-level ``parser``.

    Whitespace-only tokens are skipped, URL-like tokens become the placeholder
    'URL', tokens starting with '@' become 'SCREEN_NAME', and every other
    token is replaced by its lower-cased form.

    Args:
        text: Raw text to tokenize.

    Returns:
        list[str]: The normalised tokens, in document order.
    """
    def normalise(token):
        # The URL check takes precedence over the '@' check.
        if token.like_url:
            return 'URL'
        if token.orth_.startswith('@'):
            return 'SCREEN_NAME'
        return token.lower_

    return [
        normalise(token)
        for token in parser(text)
        if not token.orth_.isspace()
    ]

def lemma1(word):
    """Lemmatize ``word`` with WordNet using the lemmatizer's default part of speech.

    Args:
        word: A single token.

    Returns:
        The lemma returned by ``WordNetLemmatizer.lemmatize``.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word)

def lemma2(word):
    """Lemmatize ``word`` with WordNet, treating it as a verb (pos 'v').

    Args:
        word: A single token.

    Returns:
        The lemma returned by ``WordNetLemmatizer.lemmatize`` for pos 'v'.
    """
    lemmatizer = WordNetLemmatizer()
    return lemmatizer.lemmatize(word, pos='v')

def prepare_text(text):
    """Turn raw text into a list of cleaned, lemmatized tokens.

    Steps, applied per token produced by ``tokenize``:
      1. drop tokens of two characters or fewer (threshold still to be tuned),
      2. drop tokens found in the ``en_stop`` stop-word set,
      3. lemmatize with ``lemma1``, then lemmatize the result with ``lemma2``.

    Porter stemming was tried as an alternative and is intentionally not applied.

    Args:
        text: Raw text.

    Returns:
        list[str]: The surviving tokens after lemmatization, in original order.
    """
    prepared = []
    for token in tokenize(text):
        if len(token) <= 2 or token in en_stop:
            continue
        prepared.append(lemma2(lemma1(token)))
    return prepared

def prepare_topics(topics):
    """Normalise every keyword of every topic with ``prepare_text``.

    Each keyword string is run through ``prepare_text``; the resulting tokens
    are re-joined with single spaces. Duplicate results are removed while
    keeping first-seen order, and keywords that reduce to no tokens at all
    (for example pure stop words) are dropped rather than kept as ''.

    The dictionary is updated IN PLACE and the same object is returned, so the
    caller's ``topics`` and the return value refer to the same prepared data.

    Args:
        topics: Mapping of topic name -> list of keyword strings.

    Returns:
        dict: ``topics`` itself, with each value replaced by the list of
        prepared, de-duplicated keyword strings. A topic with an empty keyword
        list maps to an empty list.
    """
    for topic, words in topics.items():
        # Tuples are hashable, so dict.fromkeys de-duplicates the token
        # sequences in a single pass while preserving insertion order.
        unique_token_seqs = dict.fromkeys(
            tuple(prepare_text(word)) for word in words
        )
        # Replacing the value of an existing key is safe during iteration.
        topics[topic] = [' '.join(seq) for seq in unique_token_seqs if seq]
    return topics

def sentiment_analyzer(sentence):
    """Classify ``sentence`` as 'neutral', 'negative' or 'positive'.

    The 'neg', 'pos' and 'neu' scores are read from
    ``analyser.polarity_scores``. The sentence is 'neutral' when the neutral
    score exceeds 0.8 or when the negative and positive scores differ by less
    than 0.1; otherwise the larger of the two decides, with ties going to
    'positive'.

    Args:
        sentence: Text to score.

    Returns:
        str: One of 'neutral', 'negative', 'positive'.
    """
    scores = analyser.polarity_scores(sentence)
    neg, pos, neu = scores['neg'], scores['pos'], scores['neu']

    if neu > 0.8 or abs(neg - pos) < 0.1:
        return 'neutral'
    return 'negative' if neg > pos else 'positive'

def dutch_translator(text):
    """Translate Dutch ``text`` to English with the module-level ``translator``.

    Args:
        text: Dutch source text. ``None``, a float NaN (how pandas typically
            represents an empty spreadsheet cell) and blank strings are
            treated as "nothing to translate".

    Returns:
        str: The translated text, or '' when ``text`` is missing or blank.
    """
    # Guard first: missing or blank cells would otherwise be handed to the
    # translation client, which cannot process them.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if isinstance(text, str) and not text.strip():
        return ''
    result = translator.translate(text, dest='en', src='nl')
    return result.text

def eng_translator(text):
    """Translate English ``text`` to Dutch with the module-level ``translator``.

    Args:
        text: English source text. ``None``, a float NaN (how pandas typically
            represents an empty spreadsheet cell) and blank strings are
            treated as "nothing to translate".

    Returns:
        str: The translated text, or '' when ``text`` is missing or blank.
    """
    # Guard first: missing or blank cells would otherwise be handed to the
    # translation client, which cannot process them.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    if isinstance(text, str) and not text.strip():
        return ''
    result = translator.translate(text, dest='nl', src='en')
    return result.text

In [None]:
# prepare_topics() rewrites `topics` in place and returns that same dict, so
# `prepared_topics` and `topics` refer to one object after this line.
prepared_topics = prepare_topics(topics)

# Translate every description to English, then tokenize and lemmatize it.
# Applying the helpers to the single column they need avoids building a full
# row Series per record and also behaves sensibly on an empty frame.
df_journal['description_eng'] = df_journal['description'].apply(dutch_translator)
df_journal['prepared_description'] = df_journal['description_eng'].apply(prepare_text)

# Drop the columns that are not needed in the prepared output.
df_journal = df_journal.drop(columns=['timestamp', 'rating', 'sc_description'])
display(df_journal)

In [None]:
# Detach from whatever frame df_journal_lang was filtered out of, so the
# column assignments below act on an independent object instead of a slice.
df_journal_lang = df_journal_lang.copy()

# Translate the description and the tag to English, then tokenize and
# lemmatize the translated description. Each helper is applied to the single
# column it needs rather than to whole rows.
df_journal_lang['description_eng'] = df_journal_lang['description'].apply(dutch_translator)
df_journal_lang['tag'] = df_journal_lang['tag'].apply(dutch_translator)
df_journal_lang['prepared_description'] = df_journal_lang['description_eng'].apply(prepare_text)

# Drop the raw metadata plus the helper columns used for length filtering.
df_journal_lang = df_journal_lang.drop(
    columns=['timestamp', 'rating', 'sc_description', 'length', 'description_split']
)
display(df_journal_lang)

In [None]:
# Write the prepared journal frame to an Excel workbook (relative path, so it
# lands in the notebook's working directory). The DataFrame index is written
# too, since to_excel's `index` option is left at its default.
# NOTE(review): prepared_description holds Python lists; confirm how they are
# serialised in the workbook before reading this file back.
df_journal.to_excel('prepared_journal_model1.xlsx')

In [None]:
# Write the prepared long-entry frame to its own Excel workbook (relative
# path, so it lands in the notebook's working directory). The DataFrame index
# is written too, since to_excel's `index` option is left at its default.
# NOTE(review): prepared_description holds Python lists; confirm how they are
# serialised in the workbook before reading this file back.
df_journal_lang.to_excel('prepared_journal_model1_lang.xlsx')

In [None]:
# Print the prepared token list of every entry, one entry per line.
for prepared_tokens in df_journal_lang['prepared_description']:
    print(prepared_tokens)