In [6]:
import pandas as pd
from sklearn.utils import shuffle
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
from emoji import emoji_count
from sklearn import svm
from sklearn import metrics
import string
import statistics as stat
import nltk
from nltk.corpus import cess_esp
from nltk.corpus import words as english_dict
import re
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV,RandomizedSearchCV,KFold
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from scipy import stats

Remove outliers and train each classifier using only the most relevant features (those whose statistical measure is above 0.2).

Features used:
1. Twitter features
2. Stylistic features (each of them by themselves and all together)
3. N-gram features
4. TF + SF
5. TF + N-Gram
6. SF + N-Gram
7. TF + SF + N-Gram

In [7]:
def remove_outliers(data,outliers):
    """Drop, in place, every row whose value exceeds a per-column threshold.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to filter. It is modified in place and its index is reset to
        0..n-1 after each column has been processed.
    outliers : dict
        Maps a column name to the largest value allowed in that column; rows
        holding a strictly greater value are removed.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    for column, threshold in outliers.items():
        # drop() works on index labels, so translate the boolean mask into
        # labels; positional indices would only be right for a 0..n-1 index.
        too_large = (data[column] > threshold).to_numpy()
        data.drop(index=data.index[too_large], inplace=True)
        data.reset_index(drop=True, inplace=True)

    return data

In [8]:
def data_scaler(data):
    """Standardise every column of ``data`` with a freshly fitted StandardScaler.

    The result is a new DataFrame built from the scaled array, so it carries
    default integer column labels and a default index rather than those of
    the input frame.
    """
    standardized = StandardScaler().fit_transform(data.values)
    return pd.DataFrame(standardized)

In [80]:
def _write_metrics(header, y_true, y_pred, binomial, path='results.txt'):
    """Append accuracy, precision and recall of one run to the results file.

    ``header`` is written verbatim before the three metric lines. Precision
    and recall use binary averaging when ``binomial`` is true and weighted
    averaging otherwise.
    """
    average = 'binary' if binomial else 'weighted'
    with open(path,'a') as f:
        f.write(header)
        f.write(f'\n\t\t\t-> Accuracy: {metrics.accuracy_score(y_true, y_pred)}')
        f.write(f'\n\t\t\t-> Precision: {metrics.precision_score(y_true, y_pred, average=average)}')
        f.write(f'\n\t\t\t-> Recall: {metrics.recall_score(y_true, y_pred, average=average)}')


def classifier(classifier, data, features, target, outliers=None, type='binomial', normalize=True, ngram=False):
    """Train a baseline and a pre-tuned model and append their scores to results.txt.

    Parameters
    ----------
    classifier : str
        Model family: 'svm', 'xgboost' or 'RF'.
    data : pandas.DataFrame
        Input table. It is modified in place: columns other than ``features``
        and ``target`` are dropped, and outlier rows are removed.
    features : list of str
        Columns used as predictors.
    target : str
        Column holding the class labels.
    outliers : dict, optional
        Column -> maximum allowed value, forwarded to ``remove_outliers``.
    type : str
        'binomial' for a two-class target; any other value is treated as
        multi-class (weighted precision/recall, 'multi:softmax' for xgboost).
    normalize : bool
        Currently unused; kept so existing calls keep working.
    ngram : bool
        When true, the 'wordgram'/'chargram' columns are expected to hold one
        array per row and are expanded into the feature vector.

    Raises
    ------
    ValueError
        If ``classifier`` is not one of the supported model names.
    """
    if classifier not in ('svm', 'xgboost', 'RF'):
        raise ValueError(f"unknown classifier {classifier!r}; expected 'svm', 'xgboost' or 'RF'")

    aux = features + [target]
    data.drop([f for f in data.columns if f not in aux], axis=1, inplace=True)
    if outliers is not None:
        data = remove_outliers(data,outliers)

    # obtain target variable: each distinct label gets an integer code in
    # order of first appearance
    map_dict = {label: code for code, label in enumerate(data[target].unique())}
    Y = list(data[target].map(map_dict))
    data.drop([target], axis=1, inplace=True)

    # Binary averaging is only defined for two classes; anything else is
    # scored with weighted averages.
    binomial = type == 'binomial' and len(map_dict) == 2

    # get features
    if ngram:
        if len(features) > 1:
            if 'wordgram' in features and 'chargram' in features:
                X = [w.tolist() + c.tolist() for w,c in zip(data['wordgram'],data['chargram'])]
            # for the tests of combination of features
            else:
                word_vectors = [v.tolist() for v in data['wordgram']]
                data.drop('wordgram',axis=1,inplace=True)

                # only the remaining scalar columns are standardised
                data = data_scaler(data)
                X = [vec + rest for vec, rest in zip(word_vectors, data.values.tolist())]
        else:
            X = [v.tolist() for v in data[features[0]]]
    else:
        data = data_scaler(data)
        X = data.values.tolist()

    # separate in train and test
    # NOTE(review): shuffle() and the models below are unseeded, so scores
    # vary between runs even though the split itself uses random_state=109.
    X, Y = shuffle(X,Y)
    X_train, X_test, y_train, y_test = train_test_split(X, Y, stratify=Y, test_size=0.3,random_state=109)

    if classifier == 'svm':
        # SVC uses the RBF kernel by default, so the baseline and the "tuned"
        # model are configured identically here.
        clf = svm.SVC()
        baseline_header = '\n\t\t* Without tunning'
        params = "'kernel': 'rbf'"
        best_clf = svm.SVC(kernel='rbf')

    elif classifier == 'xgboost':
        objective = 'binary:logistic' if type == 'binomial' else 'multi:softmax'
        clf = xgb.XGBRFClassifier(objective=objective)
        baseline_header = '\n\t\t* Without tunning:'
        params = "{'learning_rate': 0.01, 'n_estimators': 200, 'reg_lambda': 1}"
        best_clf = xgb.XGBRFClassifier(objective=objective,learning_rate=0.01,n_estimators=200,reg_lambda=1)

    else:
        clf = RandomForestClassifier()
        baseline_header = '\n\t\t* Without tunning:'
        params = "{'criterion': 'entropy', 'max_leaf_nodes': None, 'min_samples_leaf': 4, 'min_samples_split': 4, 'n_estimators': 200}"
        best_clf = RandomForestClassifier(criterion='entropy',max_leaf_nodes=None,min_samples_leaf=4,min_samples_split=4,n_estimators=200)

    # baseline with default hyperparameters
    clf.fit(X_train, y_train)
    _write_metrics(baseline_header, y_test, clf.predict(X_test), binomial)

    # model with the hyperparameters recorded in ``params``
    best_clf.fit(X_train, y_train)
    _write_metrics(f'\n\t\t* Tunning:{params}', y_test, best_clf.predict(X_test), binomial)

# 1. Twitter Features

In [14]:
def twitter_df():
    """Load the user table and add per-user counts of Twitter-specific tokens.

    For every username in 'cleaned_users.xlsx' the raw document
    'Documents/<username>.txt' is read and four columns are added:
    'mentions' ('@' characters), 'url' ('http://' plus 'https://'
    occurrences), 'hashtags' ('#' characters) and 'emojis' (emoji_count).
    """
    data = pd.read_excel('cleaned_users.xlsx')

    # column name -> per-user values, in the order the columns are appended
    counts = {'mentions': [], 'url': [], 'hashtags': [], 'emojis': []}

    for username in data['username']:
        with open(f'Documents/{username}.txt','r') as f:
            text = f.read()

        counts['mentions'].append(text.count('@'))
        counts['url'].append(text.count('http://') + text.count('https://'))
        counts['hashtags'].append(text.count('#'))
        counts['emojis'].append(emoji_count(text))

    for column, values in counts.items():
        data[column] = values

    return data

In [15]:
# Build the Twitter feature table once; the prediction cells below work on copies of it.
# NOTE(review): this rebinds the name `twitter_df` from the loader function to its
# result, so re-running this cell without re-running the definition raises TypeError.
twitter_df = twitter_df()

### Gender prediction

For gender prediction, the most relevant twitter features are:
- Number of hashtags
- Number of emojis

In [None]:
# Gender from hashtag and emoji counts with a random forest; rows above the
# per-column thresholds in `outliers` are dropped first.
gender_twitter_df = twitter_df.copy()
outliers = {'hashtags':32,'emojis':100}
classifier('RF',gender_twitter_df,['hashtags','emojis'],'gender',outliers)

### Age prediction
For age prediction all the twitter features are relevant

In [None]:
# Age (multi-class) from all four Twitter counts with xgboost; no threshold is set for 'url'.
age_twitter_df = twitter_df.copy()
outliers = {'mentions':170,'hashtags':32,'emojis':100}
classifier('xgboost',age_twitter_df,['mentions','url','hashtags','emojis'],'age',outliers,type='multi')

### Region prediction
For region prediction all the twitter features are relevant

In [None]:
# Region (multi-class) from all four Twitter counts with xgboost.
region_twitter_df = twitter_df.copy()
outliers = {'mentions':150,'hashtags':32,'url':75,'emojis':80}
classifier('xgboost',region_twitter_df,['mentions','url','hashtags','emojis'],'region',outliers,type='multi')

# 2. Stylistic Features

## 2.1 Character Based

In [16]:
def df_character_based():
    """Load the user table and add character-level counts per user.

    Reads 'Cleaned Documents/<username>.txt' for every username and adds the
    columns 'characters' (text length), 'capital_letters' (characters for
    which str.isupper() is true) and 'punctuations' (characters contained in
    string.punctuation).
    """
    data = pd.read_excel('cleaned_users.xlsx')
    punctuation_marks = set(string.punctuation)

    # one (characters, capitals, punctuation) triple per user
    per_user = []
    for username in data['username']:
        with open(f'Cleaned Documents/{username}.txt','r') as f:
            text = f.read()

        capitals = sum(1 for ch in text if ch.isupper())
        marks = sum(1 for ch in text if ch in punctuation_marks)
        per_user.append((len(text), capitals, marks))

    data['characters'] = [row[0] for row in per_user]
    data['capital_letters'] = [row[1] for row in per_user]
    data['punctuations'] = [row[2] for row in per_user]

    return data

In [17]:
# Build the character-based feature table once; the prediction cells below work on copies of it.
char_based_df = df_character_based()

### Gender prediction
For gender prediction, the most relevant features are: 
- Number of capital letters
- Number of characters

In [None]:
# Gender from character and capital-letter counts with xgboost (default binomial mode).
gender_char_df = char_based_df.copy()
outliers = {'characters':24000,'capital_letters':800}
classifier('xgboost',gender_char_df,['characters','capital_letters'],'gender',outliers)

### Age prediction
For age prediction all features are relevant

In [None]:
# Age (multi-class) from all three character-based counts with xgboost.
age_char_df = char_based_df.copy()
outliers = {'characters':20000,'capital_letters':680,'punctuations':750}
classifier('xgboost',age_char_df,['characters','capital_letters','punctuations'],'age',outliers,type='multi')

### Region prediction
For region prediction all features are relevant

In [None]:
# Region (multi-class) from all three character-based counts with xgboost.
region_char_df = char_based_df.copy()
outliers = {'characters':22000,'capital_letters':680,'punctuations':750}
classifier('xgboost',region_char_df,['characters','capital_letters','punctuations'],'region',outliers,type='multi')

## 2.2 Structural Based

In [18]:
def df_structural_based():
    """Load the user table and add structural (paragraph/sentence) features.

    For every username, 'Cleaned Documents/<username>.txt' is read; pieces
    separated by a newline are treated as paragraphs and pieces separated by
    '.' as sentences. Added columns:

    - 'num_sentence': number of '.'-separated pieces
    - 'av_sentence_par': sentences per paragraph
    - 'av_words_par': non-empty space-separated words per paragraph
    - 'av_char_par': characters per paragraph
    - 'variation': sample variance of the sentence lengths (0.0 when the
      text holds a single sentence, for which a variance is undefined)
    """
    data = pd.read_excel('cleaned_users.xlsx')
    username_list = data['username']

    num_sentence = []
    av_sentence_par = []
    av_words_par = []
    av_char_par = []
    variation = []

    for username in username_list:
        with open(f'Cleaned Documents/{username}.txt','r') as f:
            text = f.read()

        # str.split always yields at least one piece, so num_par is never 0
        num_par = len(text.split('\n'))

        sentences = text.split('.')
        num_sentence_user = len(sentences)
        num_sentence.append(num_sentence_user)
        av_sentence_par.append(num_sentence_user/num_par)

        words = [w for w in text.split(' ') if len(w) > 0]
        av_words_par.append(len(words)/num_par)
        av_char_par.append(len(text)/num_par)

        len_sentence_list = [len(sentence) for sentence in sentences]
        # statistics.variance raises StatisticsError for fewer than two
        # values, which happens for any text that contains no '.'.
        if len(len_sentence_list) > 1:
            variation.append(stat.variance(len_sentence_list))
        else:
            variation.append(0.0)

    data['num_sentence'] = num_sentence
    data['av_sentence_par'] = av_sentence_par
    data['av_words_par'] = av_words_par
    data['av_char_par'] = av_char_par
    data['variation'] = variation

    return data

In [19]:
# Build the structural feature table once; the prediction cells below work on copies of it.
struct_based_df = df_structural_based()

### Gender prediction
For gender prediction, the most relevant features are:
- Sentence count
- Av count of sentence per paragraph
- Av count of words per paragraph

In [None]:
# Gender from three structural features with xgboost; 'av_words_par' has no outlier threshold.
gender_struct_df = struct_based_df.copy()
outliers = {'num_sentence':260,'av_sentence_par':2.2}
classifier('xgboost',gender_struct_df,['num_sentence','av_sentence_par','av_words_par'],'gender',outliers)

### Age prediction
For age prediction, all features are relevant

In [None]:
# Age (multi-class) from all structural features with xgboost; 'variation' has no outlier threshold.
age_struct_df = struct_based_df.copy()
outliers = {'num_sentence':280,'av_sentence_par':2,'av_words_par':20,'av_char_par':130}
classifier('xgboost',age_struct_df,['num_sentence','av_sentence_par','av_words_par','av_char_par','variation'],'age',outliers,type='multi')

### Region prediction
For region prediction, all features are relevant except average number of words per paragraph

In [None]:
# Region (multi-class) from the structural features except 'av_words_par', with xgboost.
region_struct_df = struct_based_df.copy()
outliers = {'num_sentence':260,'av_sentence_par':1.8,'av_char_par':130}
classifier('xgboost',region_struct_df,['num_sentence','av_sentence_par','av_char_par','variation'],'region',outliers,type='multi')

## 2.3 Syntactic Based

In [2]:
def normalize(s):
    """Return ``s`` with accented letters replaced by their plain counterparts.

    Handles á, é, í, ó, ú, ü, ñ and ç in both lower and upper case, and
    expands the single-character ellipsis (U+2026) into three dots.
    """
    plain_for = {
        "á": "a",
        "é": "e",
        "í": "i",
        "ó": "o",
        "ú": "u",
        "ü": "u",
        "ñ": "n",
        "ç": "c",
        "\u2026": "...",
    }
    # One translation table covering every character in both cases.
    table = {}
    for accented, plain in plain_for.items():
        table[ord(accented)] = plain
        table[ord(accented.upper())] = plain.upper()
    return s.translate(table)

In [3]:
# remove accents as tweets have them removed
# PoS Tag
def tagger():
    """Train a unigram part-of-speech tagger on the cess_esp tagged sentences.

    Every corpus word is passed through ``normalize`` before training so the
    tagger's vocabulary matches text whose accents have been removed.
    """
    unaccented_sentences = [
        [(normalize(token), tag) for token, tag in sentence]
        for sentence in cess_esp.tagged_sents()
    ]
    return nltk.UnigramTagger(unaccented_sentences)

In [53]:
def df_syntactic_based():
    """Load the user table and add absolute part-of-speech counts per user.

    Each cleaned document is lower-cased, stripped of punctuation, split on
    single spaces and tagged with the tagger returned by ``tagger()``.
    Untagged tokens (tag None) are ignored. The category is decided by the
    tag's first letter; for nouns, position 1 marks proper nouns and
    position 3 singular/plural; for verbs, position 3 is compared with 'f'
    (counted as future) and 's' (counted as past).
    NOTE(review): the tag layout is presumably that of the cess_esp tagset -
    confirm against the corpus documentation.

    Added columns: num_det, num_pre, num_sing, num_plural, num_adv, num_adj,
    num_prop, num_pronouns, num_past, num_future, num_conj.
    """
    data = pd.read_excel('cleaned_users.xlsx')
    pos_tag = tagger()

    # categories in the order their columns are appended to the frame
    categories = ('det', 'pre', 'sing', 'plural', 'adv', 'adj',
                  'prop', 'pronouns', 'past', 'future', 'conj')
    # tags whose first letter alone decides the category
    by_first_letter = {'d': 'det', 'a': 'adj', 'c': 'conj',
                       'p': 'pronouns', 'r': 'adv', 's': 'pre'}
    strip_punctuation = str.maketrans('','',string.punctuation)

    per_user = {name: [] for name in categories}

    for username in data['username']:
        with open(f'Cleaned Documents/{username}.txt','r') as f:
            text = f.read()

        # lowercase and remove punctuation marks
        tokens = text.lower().translate(strip_punctuation).split(' ')

        tally = dict.fromkeys(categories, 0)
        for _, tag in pos_tag.tag(tokens):
            if tag is None:
                continue

            first = tag[0]
            if first == 'n':
                if tag[1] == 'p':
                    tally['prop'] += 1
                number = tag[3]
                if number == 's':
                    tally['sing'] += 1
                elif number == 'p':
                    tally['plural'] += 1
            elif first == 'v':
                tense = tag[3]
                if tense == 'f':
                    tally['future'] += 1
                elif tense == 's':
                    tally['past'] += 1
            elif first in by_first_letter:
                tally[by_first_letter[first]] += 1

        for name in categories:
            per_user[name].append(tally[name])

    for name in categories:
        data[f'num_{name}'] = per_user[name]

    return data

In [4]:
# load data for syntactic (part-of-speech ratio) features analysis
def df_syntactic_based_ratios():
    """Load the user table and add part-of-speech ratios per user.

    Works like the absolute-count variant, but every count is divided by the
    number of space-separated tokens of the lower-cased, punctuation-free
    document. Untagged tokens (tag None) add to the denominator only.
    NOTE(review): the tag layout is presumably that of the cess_esp tagset -
    confirm against the corpus documentation.

    Added columns: ratio_det, ratio_pre, ratio_sing, ratio_plural, ratio_adv,
    ratio_adj, ratio_prop, ratio_pronouns, ratio_past, ratio_future,
    ratio_conj.
    """
    data = pd.read_excel('cleaned_users.xlsx')
    pos_tag = tagger()

    # categories in the order their columns are appended to the frame
    categories = ('det', 'pre', 'sing', 'plural', 'adv', 'adj',
                  'prop', 'pronouns', 'past', 'future', 'conj')
    # tags whose first letter alone decides the category
    by_first_letter = {'d': 'det', 'a': 'adj', 'c': 'conj',
                       'p': 'pronouns', 'r': 'adv', 's': 'pre'}
    strip_punctuation = str.maketrans('','',string.punctuation)

    per_user = {name: [] for name in categories}

    for username in data['username']:
        with open(f'Cleaned Documents/{username}.txt','r') as f:
            text = f.read()

        # lowercase and remove punctuation marks
        tokens = text.lower().translate(strip_punctuation).split(' ')
        analysis = pos_tag.tag(tokens)
        num_words = len(analysis)

        tally = dict.fromkeys(categories, 0)
        for _, tag in analysis:
            if tag is None:
                continue

            first = tag[0]
            if first == 'n':
                if tag[1] == 'p':
                    tally['prop'] += 1
                number = tag[3]
                if number == 's':
                    tally['sing'] += 1
                elif number == 'p':
                    tally['plural'] += 1
            elif first == 'v':
                tense = tag[3]
                if tense == 'f':
                    tally['future'] += 1
                elif tense == 's':
                    tally['past'] += 1
            elif first in by_first_letter:
                tally[by_first_letter[first]] += 1

        for name in categories:
            per_user[name].append(tally[name]/num_words)

    for name in categories:
        data[f'ratio_{name}'] = per_user[name]

    return data

In [54]:
# Build the absolute part-of-speech count table once; the prediction cells below work on copies of it.
synt_based_df = df_syntactic_based()

In [10]:
# Build the part-of-speech ratio table once and preview its first rows.
synt_based_ratio_df = df_syntactic_based_ratios()
synt_based_ratio_df.head()

Unnamed: 0,username,gender,age,region,ratio_det,ratio_pre,ratio_sing,ratio_plural,ratio_adv,ratio_adj,ratio_prop,ratio_pronouns,ratio_past,ratio_future,ratio_conj
0,lozanogarcia68,female,55+,Madrid,0.12297,0.12761,0.12065,0.032483,0.060325,0.058005,0.0,0.066125,0.00348,0.00232,0.035963
1,beltrangmodet,male,18-24,Madrid,0.095745,0.170213,0.021277,0.021277,0.06383,0.021277,0.0,0.117021,0.0,0.0,0.031915
2,edubellver,male,18-24,Madrid,0.133858,0.141732,0.094488,0.031496,0.015748,0.03937,0.0,0.07874,0.0,0.0,0.031496
3,luss_27,female,18-24,Madrid,0.134884,0.098605,0.092093,0.026977,0.073488,0.031628,0.0,0.109767,0.002791,0.00186,0.044651
4,k15ce,male,25-34,Madrid,0.15857,0.119157,0.121907,0.027498,0.055912,0.042163,0.0,0.082493,0.010999,0.000917,0.049496


### Gender prediction
For gender prediction, the most relevant features are:
- Num of determiners
- Num of prepositions
- Num of singular nouns
- Num of adjectives
- Num of future tense verbs
- Num of conjunctions

In [None]:
# Gender from absolute part-of-speech counts with xgboost.
gender_synt_df = synt_based_df.copy()
outliers = {'num_det':450,'num_pre':450,'num_sing':400,'num_adj':150,'num_future':8,'num_conj':150}
classifier('xgboost',gender_synt_df,['num_det','num_pre','num_sing','num_adj','num_future','num_conj'],'gender',outliers)

In [71]:
# Same prediction using the ratio features instead; no outlier removal is applied.
gender_synt_df = synt_based_ratio_df.copy()
outliers = None
classifier('xgboost',gender_synt_df,['ratio_det','ratio_pre','ratio_sing','ratio_adj','ratio_pronouns','ratio_future'],'gender',outliers)

### Age prediction
For age prediction, the most relevant features are:
- Num of plural nouns
- Num of adverbs
- Num of singular nouns
- Num of past tense verbs
- Num of future tense verbs
- Num of prepositions

In [None]:
# Age (multi-class) from absolute part-of-speech counts with xgboost.
age_synt_df = synt_based_df.copy()
outliers = {'num_plural':90,'num_adv':120,'num_sing':380,'num_past':17,'num_future':8,'num_pre':450}
classifier('xgboost',age_synt_df,['num_plural','num_adv','num_sing','num_past','num_future','num_pre'],'age',outliers,type='multi')

In [72]:
# Same prediction using the ratio features instead; no outlier removal is applied.
age_synt_df = synt_based_ratio_df.copy()
outliers = None
classifier('xgboost',age_synt_df,['ratio_det','ratio_pre','ratio_sing','ratio_plural','ratio_adv','ratio_adj','ratio_pronouns','ratio_past','ratio_future'],'age',outliers,type='multi')

### Region prediction
For region prediction, the most relevant features are:
- Num of determiners
- Num of prepositions
- Num of adverbs
- Num of past tense verbs
- Num of future tense verbs
- Num of conjunctions

In [None]:
# Region (multi-class) from absolute part-of-speech counts with xgboost; 'num_adv' has no outlier threshold.
region_synt_df = synt_based_df.copy()
outliers = {'num_det':450,'num_pre':470,'num_past':18,'num_future':8,'num_conj':140}
classifier('xgboost',region_synt_df,['num_det','num_pre','num_adv','num_past','num_future','num_conj'],'region',outliers,type='multi')

In [73]:
# Same prediction using the ratio features instead; no outlier removal is applied.
region_synt_df = synt_based_ratio_df.copy()
outliers = None
classifier('xgboost',region_synt_df,['ratio_det','ratio_pre','ratio_sing','ratio_plural','ratio_adv','ratio_adj','ratio_pronouns','ratio_past','ratio_future'],'region',outliers,type='multi')

## 2.4 Word Based

In [22]:
# Spanish NRC emotion lexicon: one row per word, one 0/1 column per emotion.
sent_analysis_data = pd.read_csv('../Spanish-NRC-EmoLex.txt',sep='\t')
# Accent-free version of every lexicon word, aligned with the rows above.
spanish_dict = [normalize(w) for w in list(sent_analysis_data['Spanish Word'])]
columns = sent_analysis_data.keys()
negative_cols = ['negative','fear','anger','disgust','sadness']
positive_cols = ['positive','joy','trust']

# A word is negative (positive) when any of the corresponding columns is 1.
negative_words = {
    word
    for col in negative_cols
    for word, val in zip(spanish_dict, sent_analysis_data[col])
    if val == 1
}

positive_words = {
    word
    for col in positive_cols
    for word, val in zip(spanish_dict, sent_analysis_data[col])
    if val == 1
}

In [21]:
def sent_analysis(words):
    """Count the distinct positive and negative lexicon words found in ``words``.

    Returns a ``(positive, negative)`` tuple; a word that appears several
    times in ``words`` is counted once.
    """
    vocabulary = set(words)
    positives = positive_words & vocabulary
    negatives = negative_words & vocabulary
    return len(positives), len(negatives)

In [23]:
# returns the length of the first item that occurs exactly 'num' times, or 0 if there is none
def num_occurences(lista,num):
    """Return ``len()`` of the first item of ``lista`` seen exactly ``num`` times.

    Items are considered in order of first appearance. 0 is returned when no
    item occurs exactly ``num`` times, or when the matching item has no
    length.

    NOTE(review): the callers store this value in columns named 'num_unique'
    and 'num_twice', which suggests a count of such words was intended
    (``sum(1 for c in counts.values() if c == num)``). The return value is
    left unchanged here because the outlier thresholds used on 'num_unique'
    are calibrated against it - confirm the intent before switching.
    """
    occurrences = {}
    for item in lista:
        occurrences[item] = occurrences.get(item, 0) + 1

    for word, times in occurrences.items():
        if times == num:
            try:
                return len(word)
            except TypeError:
                # item without a length (e.g. a number)
                return 0
    return 0

In [26]:
# load data for word based features analysis
def df_word_based():
    """Load the user table and add word-level features per user.

    Each cleaned document has its punctuation removed and is split on
    newlines and single spaces; empty tokens are discarded. Added columns:

    - 'num_words': number of tokens
    - 'num_pos_words' / 'num_neg_words': distinct lower-cased tokens found in
      the positive / negative lexicon (see ``sent_analysis``)
    - 'num_unique' / 'num_twice': ``num_occurences(words, 1)`` and
      ``num_occurences(words, 2)``
    - 'av_length' / 'max_length': mean and maximum token length (0 for a
      document without tokens)
    - 'num_numbers': tokens containing at least one digit
    - 'num_greater' / 'num_smaller': tokens longer than 6 / shorter than 3
      characters
    - 'num_stop': distinct lower-cased tokens that are Spanish stop words
    """
    data = pd.read_excel('cleaned_users.xlsx')
    username_list = data['username']

    # load spanish stop words; built once as a set for the per-user lookups
    stop_words_df = pd.read_csv('../spanish-stop-words.txt',header=None)
    stop_words = {normalize(w) for w in list(stop_words_df[0])}

    strip_punctuation = str.maketrans('','',string.punctuation)
    has_digit = re.compile(r'\d')

    # column name -> per-user values, in the order the columns are appended
    columns = ('num_words', 'num_pos_words', 'num_neg_words', 'num_unique',
               'num_twice', 'av_length', 'max_length', 'num_numbers',
               'num_greater', 'num_smaller', 'num_stop')
    per_user = {name: [] for name in columns}

    for username in username_list:
        with open(f'Cleaned Documents/{username}.txt','r') as f:
            text = f.read()

        # remove punctuation
        text = text.translate(strip_punctuation)
        words = [w for line in text.split('\n') for w in line.split(' ') if len(w) != 0]
        words_lower = [w.lower() for w in words]

        per_user['num_words'].append(len(words))

        pos, neg = sent_analysis(words_lower)
        per_user['num_pos_words'].append(pos)
        per_user['num_neg_words'].append(neg)

        per_user['num_unique'].append(num_occurences(words,1))
        per_user['num_twice'].append(num_occurences(words,2))

        lengths = [len(w) for w in words]
        # guard against a document that has no tokens at all
        per_user['av_length'].append(sum(lengths)/len(words) if words else 0.0)
        per_user['max_length'].append(max(lengths, default=0))
        per_user['num_numbers'].append(sum(1 for w in words if has_digit.search(w)))
        per_user['num_greater'].append(sum(1 for n in lengths if n > 6))
        per_user['num_smaller'].append(sum(1 for n in lengths if n < 3))

        # count stop-words (distinct ones, not occurrences)
        per_user['num_stop'].append(len(stop_words & set(words_lower)))

    for name in columns:
        data[name] = per_user[name]

    return data

In [27]:
# Build the word-based feature table once; the prediction cells below work on copies of it.
word_based_df = df_word_based()

### Gender prediction
For gender prediction, the most relevant features are:
- Num of words
- Num of negative words
- Average word length
- Max word length
- Num of words with numbers
- Num of words of length smaller than 3
- Num of stop-words

In [None]:
# Gender from seven word-based features with xgboost; 'num_stop' has no outlier threshold.
gender_word_df = word_based_df.copy()
outliers = {'num_words':3200,'num_neg_words':110,'av_length':6,'max_length':52,'num_numbers':50,'num_smaller':900}
classifier('xgboost',gender_word_df,['num_words','num_neg_words','av_length','max_length','num_numbers','num_smaller','num_stop'],'gender',outliers)

### Age prediction
For age prediction, the most relevant features are all except 'num of unique words' and 'number of words that occur twice'


In [None]:
# Age (multi-class) from the word-based features except 'num_unique' and 'num_twice', with xgboost.
age_word_df = word_based_df.copy()
outliers = {'num_words':3500,'num_pos_words':120,'num_neg_words':110,'av_length':6,'max_length':52,'num_numbers':50,'num_greater':900,'num_smaller':900}
classifier('xgboost',age_word_df,['num_words','num_pos_words','num_neg_words','av_length','max_length','num_numbers','num_greater','num_smaller','num_stop'],'age',outliers,type='multi')

### Region prediction
For region prediction, the most relevant features are all except 'number of positive words' and 'number of words that occur twice'

In [None]:
# Region (multi-class) from the word-based features except 'num_pos_words' and 'num_twice', with xgboost.
region_word_df = word_based_df.copy()
outliers = {'num_words':3500,'num_neg_words':110,'num_unique':10,'av_length':6,'max_length':50,'num_numbers':50,'num_greater':900,'num_smaller':900}
classifier('xgboost',region_word_df,['num_words','num_neg_words','num_unique','av_length','max_length','num_numbers','num_greater','num_smaller','num_stop'],'region',outliers,type='multi')

## 2.5 All stylistic features

In [28]:
# Combine the four stylistic feature tables (syntactic features as ratios) column-wise.
char_df = char_based_df.copy()
struct_df = struct_based_df.copy()
synt_df = synt_based_ratio_df.copy()
word_df = word_based_df.copy()

stylistic_df = pd.concat([char_df,struct_df,synt_df,word_df],axis=1)
# The concat repeats every column the frames share; keep only the first
# occurrence of each column name.
stylistic_df = stylistic_df.iloc[:,~stylistic_df.columns.duplicated()]

### Gender prediction

In [29]:
# Preview the combined stylistic feature table.
gender_df = stylistic_df.copy()
gender_df.head()

Unnamed: 0,username,gender,age,region,characters,capital_letters,punctuations,num_sentence,av_sentence_par,av_words_par,...,num_pos_words,num_neg_words,num_unique,num_twice,av_length,max_length,num_numbers,num_greater,num_smaller,num_stop
0,lozanogarcia68,female,55+,Madrid,5731,233,390,208,4.16,17.22,...,44,31,6,4,5.104839,21,13,237,241,131
1,beltrangmodet,male,18-24,Madrid,554,38,16,12,1.2,9.4,...,6,1,6,4,4.316832,15,1,17,24,32
2,edubellver,male,18-24,Madrid,869,17,42,23,1.0,5.521739,...,11,10,2,4,4.557047,14,0,29,28,46
3,luss_27,female,18-24,Madrid,7044,362,437,175,1.535088,9.429825,...,41,33,5,3,4.766931,44,2,228,332,148
4,k15ce,male,25-34,Madrid,7117,299,333,224,2.055046,10.0,...,50,34,4,5,4.737913,45,8,263,366,151


In [74]:
# Gender from all stylistic features with xgboost (default binomial mode).
gender_df = stylistic_df.copy()

# per-column upper bounds; rows above any of them are dropped before training
outliers = {'characters':24000,
            'capital_letters':800,
            'num_sentence':260,
            'av_sentence_par':2.2,
            'num_words':3200,
            'num_neg_words':110,
            'av_length':6,
            'max_length':52,
            'num_numbers':50,
            'num_smaller':900
            }

# every thresholded column is a feature; the ratio columns are used without thresholds
features = list(outliers.keys()) + ['ratio_det','ratio_pre','ratio_sing','ratio_adj','ratio_pronouns','ratio_future']

classifier('xgboost',gender_df,features,'gender',outliers)

### Age prediction

In [75]:
# Age (multi-class) from all stylistic features with xgboost.
age_df = stylistic_df.copy()

# per-column upper bounds; rows above any of them are dropped before training
outliers = {'characters':20000,
            'capital_letters':680,
            'punctuations':750,
            'num_sentence':280,
            'av_sentence_par':2,
            'av_words_par':20,
            'av_char_par':130,
            'num_words':3500,
            'num_pos_words':120,
            'num_neg_words':110,
            'av_length':6,
            'max_length':52,
            'num_numbers':50,
            'num_greater':900,
            'num_smaller':900
            }

# every thresholded column is a feature; the ratio columns are used without thresholds
features = list(outliers.keys()) + ['ratio_det','ratio_pre','ratio_sing','ratio_plural','ratio_adv','ratio_adj','ratio_pronouns','ratio_past','ratio_future']

# 'multi' is the spelling used by every other multi-class call in this notebook
classifier('xgboost',age_df,features,'age',outliers,type='multi')

### Region prediction

In [76]:
# Region (multi-class) from all stylistic features with xgboost.
region_df = stylistic_df.copy()

# per-column upper bounds; rows above any of them are dropped before training
outliers = {'characters':22000,
            'capital_letters':680,
            'punctuations':750,
            'num_sentence':260,
            'av_sentence_par':1.8,
            'av_char_par':130,
            'num_words':3500,
            'num_neg_words':110,
            'num_unique':10,
            'av_length':6,
            'max_length':50,
            'num_numbers':50,
            'num_greater':900,
            'num_smaller':900
            }

# every thresholded column is a feature; the ratio columns are used without thresholds
features = list(outliers.keys()) + ['ratio_det','ratio_pre','ratio_sing','ratio_plural','ratio_adv','ratio_adj','ratio_pronouns','ratio_past','ratio_future']

# 'multi' is the spelling used by every other multi-class call in this notebook
classifier('xgboost',region_df,features,'region',outliers,type='multi')

# 3. N-Grams

In [43]:
# load spanish stop words and remove accents (tweets dont have accents)
# Stop-word list for the n-gram vectorizer: the file's words with accents
# removed, plus the two extra tokens 'q' and 'ma'.
stop_words_df = pd.read_csv('../spanish-stop-words.txt',header=None)
stop_words = [normalize(w) for w in list(stop_words_df[0])] + ['q','ma']

In [44]:
def _mean_pca_vector(tfidf_matrix, n_components):
    """Centre a sparse TF-IDF matrix, project it with PCA and average the rows.

    Parameters
    ----------
    tfidf_matrix : sparse matrix
        Output of ``TfidfVectorizer.fit_transform`` (one row per tweet).
    n_components : int
        Number of principal components to keep.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_components`` holding the column means of the
        projected rows.
    """
    # toarray() yields a plain ndarray; todense() would return the legacy np.matrix type.
    dense_df = pd.DataFrame(tfidf_matrix.toarray())
    centered_df = dense_df - dense_df.mean()
    # A fresh PCA per matrix, so no fitted state is shared between the word and char runs.
    projected_df = pd.DataFrame(PCA(n_components=n_components).fit_transform(centered_df))
    # NOTE(review): PCA centres its input before projecting, so the per-component
    # mean over the very rows it was fitted on is zero up to floating-point error,
    # and every author gets an independently fitted basis. The vector returned here
    # may therefore carry little signal that is comparable across authors --
    # confirm this is the intended feature.
    return np.array(projected_df.mean(axis=0))


def ngram_df(min_tweets=90, n_components=90):
    """Build the per-user table of word- and character-n-gram vectors.

    Reads the user list from 'cleaned_users.xlsx' and, for every username,
    the document 'Cleaned Documents/<username>.txt'. The document is split on
    newlines into tweets; TF-IDF matrices (word 1-2 grams without stop words,
    character 2-7 grams) are fitted on that user's tweets only, reduced with
    PCA and averaged into one vector per user.

    Parameters
    ----------
    min_tweets : int, default 90
        Users whose document splits into fewer pieces than this get ``None``
        in both vector columns.
    n_components : int, default 90
        Length of each author vector. PCA requires this to be no larger than
        the number of tweets and the number of TF-IDF features of a user, so it
        should not exceed ``min_tweets``.

    Returns
    -------
    pandas.DataFrame
        The frame read from the spreadsheet with three added columns:
        'text' (raw document), 'wordgram' and 'chargram' (1-D arrays or None).
    """
    data = pd.read_excel('cleaned_users.xlsx')

    # Relies on the module-level stop_words list built earlier in the notebook.
    char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(2, 7))
    word_vectorizer = TfidfVectorizer(stop_words=stop_words, ngram_range=(1, 2))

    texts, wordgram, chargram = [], [], []
    for username in data['username']:
        # NOTE(review): no explicit encoding is passed, so the platform default
        # is used -- confirm the documents were written with the same default.
        with open(f'Cleaned Documents/{username}.txt', 'r') as f:
            document = f.read()
        texts.append(document)

        tweets = document.split('\n')
        if len(tweets) < min_tweets:
            # Too few tweets: keep the row but mark both vectors as missing.
            wordgram.append(None)
            chargram.append(None)
            continue

        # Vectorizers are refitted per user, so each user has its own vocabulary.
        wordgram.append(_mean_pca_vector(word_vectorizer.fit_transform(tweets), n_components))
        chargram.append(_mean_pca_vector(char_vectorizer.fit_transform(tweets), n_components))

    data['text'] = texts
    data['wordgram'] = wordgram
    data['chargram'] = chargram

    return data
        

In [45]:
# The builder function is replaced by its own result, so after this cell the
# name ngram_df refers to a DataFrame. Only call it while it is still the
# function; otherwise re-running this cell would raise
# "'DataFrame' object is not callable".
if callable(ngram_df):
    ngram_df = ngram_df()

## 3.1 Word Gram 

### Gender prediction

In [None]:
# Rows with any missing value are discarded; the copy lets classifier() modify the frame in place.
gender_wordgram_df = ngram_df.dropna().copy()

classifier(
    'xgboost',
    gender_wordgram_df,
    ['wordgram'],
    'gender',
    normalize=False,
    ngram=True,
)

### Age prediction

In [134]:
# Rows with any missing value are discarded; the copy lets classifier() modify the frame in place.
age_wordgram_df = ngram_df.dropna().copy()

classifier(
    'xgboost',
    age_wordgram_df,
    ['wordgram'],
    'age',
    type='multi',
    normalize=False,
    ngram=True,
)

### Region prediction

In [135]:
# Rows with any missing value are discarded; the copy lets classifier() modify the frame in place.
region_wordgram_df = ngram_df.dropna().copy()

classifier(
    'xgboost',
    region_wordgram_df,
    ['wordgram'],
    'region',
    type='multi',
    normalize=False,
    ngram=True,
)

## 3.2 Character Gram

### Gender prediction

In [136]:
# Rows with any missing value are discarded; the copy lets classifier() modify the frame in place.
gender_chargram_df = ngram_df.dropna().copy()

classifier(
    'xgboost',
    gender_chargram_df,
    ['chargram'],
    'gender',
    normalize=False,
    ngram=True,
)

### Age prediction

In [137]:
# Rows with any missing value are discarded; the copy lets classifier() modify the frame in place.
age_chargram_df = ngram_df.dropna().copy()

classifier(
    'xgboost',
    age_chargram_df,
    ['chargram'],
    'age',
    type='multi',
    normalize=False,
    ngram=True,
)

### Region prediction

In [138]:
# Rows with any missing value are discarded; the copy lets classifier() modify the frame in place.
region_chargram_df = ngram_df.dropna().copy()

classifier(
    'xgboost',
    region_chargram_df,
    ['chargram'],
    'region',
    type='multi',
    normalize=False,
    ngram=True,
)

## 3.3 Both

### Gender prediction

In [141]:
# Rows with any missing value are discarded; the copy lets classifier() modify the frame in place.
gender_ngram_df = ngram_df.dropna().copy()

classifier(
    'xgboost',
    gender_ngram_df,
    ['wordgram', 'chargram'],
    'gender',
    normalize=False,
    ngram=True,
)

### Age prediction

In [142]:
# Rows with any missing value are discarded; the copy lets classifier() modify the frame in place.
age_ngram_df = ngram_df.dropna().copy()

classifier(
    'xgboost',
    age_ngram_df,
    ['wordgram', 'chargram'],
    'age',
    type='multi',
    normalize=False,
    ngram=True,
)

### Region prediction

In [143]:
# Rows with any missing value are discarded; the copy lets classifier() modify the frame in place.
region_ngram_df = ngram_df.dropna().copy()

classifier(
    'xgboost',
    region_ngram_df,
    ['wordgram', 'chargram'],
    'region',
    type='multi',
    normalize=False,
    ngram=True,
)

# 4. TF + SF (ALL)

In [37]:
# Put the stylistic and Twitter feature tables side by side (rows are aligned on the index).
sf_df = stylistic_df.copy()
tf_df = twitter_df.copy()

combi1_df = pd.concat((sf_df, tf_df), axis='columns')
# Columns present in both tables appear twice after the concat; keep the first occurrence only.
combi1_df = combi1_df.loc[:, ~combi1_df.columns.duplicated()]

### Gender prediction

In [77]:
# Work on a copy: classifier() drops columns and outlier rows from the frame it receives.
gender_combi1_df = combi1_df.copy()

# Upper bound per feature; remove_outliers() discards rows whose value exceeds it.
outliers = {
    'hashtags': 32,
    'emojis': 100,
    'characters': 24000,
    'capital_letters': 800,
    'num_sentence': 260,
    'av_sentence_par': 2.2,
    'num_words': 3200,
    'num_neg_words': 110,
    'av_length': 6,
    'max_length': 52,
    'num_numbers': 50,
    'num_smaller': 900,
}

# Every thresholded column is also a model input, followed by the ratio features.
features = [
    *outliers,
    'ratio_det', 'ratio_pre', 'ratio_sing',
    'ratio_adj', 'ratio_pronouns', 'ratio_future',
]

classifier('xgboost', gender_combi1_df, features, 'gender', outliers)

### Age prediction

In [78]:
# Work on a copy: classifier() drops columns and outlier rows from the frame it receives.
age_combi1_df = combi1_df.copy()

# Upper bound per feature; remove_outliers() discards rows whose value exceeds it.
outliers = {
    'mentions': 170,
    'hashtags': 32,
    'emojis': 100,
    'characters': 20000,
    'capital_letters': 680,
    'punctuations': 750,
    'num_sentence': 280,
    'av_sentence_par': 2,
    'av_words_par': 20,
    'av_char_par': 130,
    'num_words': 3500,
    'num_pos_words': 120,
    'num_neg_words': 110,
    'av_length': 6,
    'max_length': 52,
    'num_numbers': 50,
    'num_greater': 900,
    'num_smaller': 900,
}

# Every thresholded column is also a model input, followed by the ratio features.
features = [
    *outliers,
    'ratio_det', 'ratio_pre', 'ratio_sing', 'ratio_plural', 'ratio_adv',
    'ratio_adj', 'ratio_pronouns', 'ratio_past', 'ratio_future',
]

# NOTE(review): type is spelled 'mult' here but 'multi' in other cells of this
# notebook -- confirm classifier() treats both spellings the same way.
classifier('xgboost', age_combi1_df, features, 'age', outliers, type='mult')

### Region prediction

In [79]:
# Work on a copy: classifier() drops columns and outlier rows from the frame it receives.
region_combi1_df = combi1_df.copy()

# Upper bound per feature; remove_outliers() discards rows whose value exceeds it.
outliers = {
    'mentions': 150,
    'hashtags': 32,
    'url': 75,
    'emojis': 80,
    'characters': 22000,
    'capital_letters': 680,
    'punctuations': 750,
    'num_sentence': 260,
    'av_sentence_par': 1.8,
    'av_char_par': 130,
    'num_words': 3500,
    'num_neg_words': 110,
    'num_unique': 10,
    'av_length': 6,
    'max_length': 50,
    'num_numbers': 50,
    'num_greater': 900,
    'num_smaller': 900,
}

# Every thresholded column is also a model input, followed by the ratio features.
features = [
    *outliers,
    'ratio_det', 'ratio_pre', 'ratio_sing', 'ratio_plural', 'ratio_adv',
    'ratio_adj', 'ratio_pronouns', 'ratio_past', 'ratio_future',
]

# NOTE(review): type is spelled 'mult' here but 'multi' in other cells of this
# notebook -- confirm classifier() treats both spellings the same way.
classifier('xgboost', region_combi1_df, features, 'region', outliers, type='mult')

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# 5. TF + N-Gram 

In [87]:
# Put the n-gram and Twitter feature tables side by side (rows are aligned on the index).
ng_df = ngram_df.copy()
tf_df = twitter_df.copy()

combi2_df = pd.concat((ng_df, tf_df), axis='columns')
# Columns present in both tables appear twice after the concat; keep the first occurrence only.
combi2_df = combi2_df.loc[:, ~combi2_df.columns.duplicated()]

### Gender prediction

In [147]:
# Drop incomplete rows and renumber from 0: remove_outliers() drops rows by
# the positions returned from np.where, so labels must match positions.
gender_combi2_df = combi2_df.dropna().reset_index(drop=True)

# Upper bound per Twitter feature; rows above it are removed before training.
outliers = {
    'hashtags': 32,
    'emojis': 100,
}

features = [*outliers, 'wordgram']

classifier('xgboost', gender_combi2_df, features, 'gender', outliers, ngram=True)

### Age prediction

In [148]:
# Drop incomplete rows and renumber from 0: remove_outliers() drops rows by
# the positions returned from np.where, so labels must match positions.
age_combi2_df = combi2_df.dropna().reset_index(drop=True)

# Upper bound per Twitter feature; rows above it are removed before training.
outliers = {
    'mentions': 170,
    'hashtags': 32,
    'emojis': 100,
}

features = [*outliers, 'wordgram']

classifier('xgboost', age_combi2_df, features, 'age', outliers, type='multi', ngram=True)

### Region prediction

In [149]:
# Drop incomplete rows and renumber from 0: remove_outliers() drops rows by
# the positions returned from np.where, so labels must match positions.
region_combi2_df = combi2_df.dropna().reset_index(drop=True)

# Upper bound per Twitter feature; rows above it are removed before training.
outliers = {
    'mentions': 150,
    'hashtags': 32,
    'url': 75,
    'emojis': 80,
}

features = [*outliers, 'wordgram']

classifier('xgboost', region_combi2_df, features, 'region', outliers, type='multi', ngram=True)

# 6. SF + N-Gram

In [46]:
# Put the n-gram and stylistic feature tables side by side (rows are aligned on the index).
ng_df = ngram_df.copy()
sf_df = stylistic_df.copy()

combi3_df = pd.concat((ng_df, sf_df), axis='columns')
# Columns present in both tables appear twice after the concat; keep the first occurrence only.
combi3_df = combi3_df.loc[:, ~combi3_df.columns.duplicated()]

### Gender prediction

In [81]:
# Drop incomplete rows and renumber from 0: remove_outliers() drops rows by
# the positions returned from np.where, so labels must match positions.
gender_combi3_df = combi3_df.dropna().reset_index(drop=True)

# Upper bound per stylistic feature; rows above it are removed before training.
outliers = {
    'characters': 24000,
    'capital_letters': 800,
    'num_sentence': 260,
    'av_sentence_par': 2.2,
    'num_words': 3200,
    'num_neg_words': 110,
    'av_length': 6,
    'max_length': 52,
    'num_numbers': 50,
    'num_smaller': 900,
}

# Thresholded columns, then the word n-gram vector, then the ratio features.
features = [
    *outliers,
    'wordgram',
    'ratio_det', 'ratio_pre', 'ratio_sing',
    'ratio_adj', 'ratio_pronouns', 'ratio_future',
]

classifier('xgboost', gender_combi3_df, features, 'gender', outliers, ngram=True)

### Age prediction

In [82]:
# Drop incomplete rows and renumber from 0: remove_outliers() drops rows by
# the positions returned from np.where, so labels must match positions.
age_combi3_df = combi3_df.dropna().reset_index(drop=True)

# Upper bound per stylistic feature; rows above it are removed before training.
outliers = {
    'characters': 20000,
    'capital_letters': 680,
    'punctuations': 750,
    'num_sentence': 280,
    'av_sentence_par': 2,
    'av_words_par': 20,
    'av_char_par': 130,
    'num_words': 3500,
    'num_pos_words': 120,
    'num_neg_words': 110,
    'av_length': 6,
    'max_length': 52,
    'num_numbers': 50,
    'num_greater': 900,
    'num_smaller': 900,
}

# Thresholded columns, then the word n-gram vector, then the ratio features.
features = [
    *outliers,
    'wordgram',
    'ratio_det', 'ratio_pre', 'ratio_sing', 'ratio_plural', 'ratio_adv',
    'ratio_adj', 'ratio_pronouns', 'ratio_past', 'ratio_future',
]

# NOTE(review): type is spelled 'mult' here but 'multi' in other cells of this
# notebook -- confirm classifier() treats both spellings the same way.
classifier('xgboost', age_combi3_df, features, 'age', outliers, type='mult', ngram=True)

### Region prediction

In [83]:
# Drop incomplete rows and renumber from 0: remove_outliers() drops rows by
# the positions returned from np.where, so labels must match positions.
region_combi3_df = combi3_df.dropna().reset_index(drop=True)

# Upper bound per stylistic feature; rows above it are removed before training.
outliers = {
    'characters': 22000,
    'capital_letters': 680,
    'punctuations': 750,
    'num_sentence': 260,
    'av_sentence_par': 1.8,
    'av_char_par': 130,
    'num_words': 3500,
    'num_neg_words': 110,
    'num_unique': 10,
    'av_length': 6,
    'max_length': 50,
    'num_numbers': 50,
    'num_greater': 900,
    'num_smaller': 900,
}

# Thresholded columns, then the word n-gram vector, then the ratio features.
features = [
    *outliers,
    'wordgram',
    'ratio_det', 'ratio_pre', 'ratio_sing', 'ratio_plural', 'ratio_adv',
    'ratio_adj', 'ratio_pronouns', 'ratio_past', 'ratio_future',
]

# NOTE(review): type is spelled 'mult' here but 'multi' in other cells of this
# notebook -- confirm classifier() treats both spellings the same way.
classifier('xgboost', region_combi3_df, features, 'region', outliers, type='mult', ngram=True)

  _warn_prf(average, modifier, msg_start, len(result))


# 7. TF + SF + N-Gram

In [51]:
# Put the n-gram, stylistic and Twitter feature tables side by side (rows are aligned on the index).
ng_df = ngram_df.copy()
sf_df = stylistic_df.copy()
tf_df = twitter_df.copy()

combi4_df = pd.concat((ng_df, sf_df, tf_df), axis='columns')
# Columns present in several tables appear repeatedly after the concat; keep the first occurrence only.
combi4_df = combi4_df.loc[:, ~combi4_df.columns.duplicated()]

### Gender prediction

In [84]:
# Drop incomplete rows and renumber from 0: remove_outliers() drops rows by
# the positions returned from np.where, so labels must match positions.
gender_combi4_df = combi4_df.dropna().reset_index(drop=True)

# Upper bound per feature; rows above it are removed before training.
outliers = {
    'hashtags': 32,
    'emojis': 100,
    'characters': 24000,
    'capital_letters': 800,
    'num_sentence': 260,
    'av_sentence_par': 2.2,
    'num_words': 3200,
    'num_neg_words': 110,
    'av_length': 6,
    'max_length': 52,
    'num_numbers': 50,
    'num_smaller': 900,
}

# Thresholded columns, then the word n-gram vector, then the ratio features.
features = [
    *outliers,
    'wordgram',
    'ratio_det', 'ratio_pre', 'ratio_sing',
    'ratio_adj', 'ratio_pronouns', 'ratio_future',
]

classifier('xgboost', gender_combi4_df, features, 'gender', outliers, ngram=True)

### Age prediction

In [85]:
# Drop incomplete rows and renumber from 0: remove_outliers() drops rows by
# the positions returned from np.where, so labels must match positions.
age_combi4_df = combi4_df.dropna().reset_index(drop=True)

# Upper bound per feature; rows above it are removed before training.
outliers = {
    'mentions': 170,
    'hashtags': 32,
    'emojis': 100,
    'characters': 20000,
    'capital_letters': 680,
    'punctuations': 750,
    'num_sentence': 280,
    'av_sentence_par': 2,
    'av_words_par': 20,
    'av_char_par': 130,
    'num_words': 3500,
    'num_pos_words': 120,
    'num_neg_words': 110,
    'av_length': 6,
    'max_length': 52,
    'num_numbers': 50,
    'num_greater': 900,
    'num_smaller': 900,
}

# Thresholded columns, then the word n-gram vector, then the ratio features.
features = [
    *outliers,
    'wordgram',
    'ratio_det', 'ratio_pre', 'ratio_sing', 'ratio_plural', 'ratio_adv',
    'ratio_adj', 'ratio_pronouns', 'ratio_past', 'ratio_future',
]

# NOTE(review): type is spelled 'mult' here but 'multi' in other cells of this
# notebook -- confirm classifier() treats both spellings the same way.
classifier('xgboost', age_combi4_df, features, 'age', outliers, type='mult', ngram=True)

### Region prediction

In [86]:
# Drop incomplete rows and renumber from 0: remove_outliers() drops rows by
# the positions returned from np.where, so labels must match positions.
region_combi4_df = combi4_df.dropna().reset_index(drop=True)

# Upper bound per feature; rows above it are removed before training.
outliers = {
    'mentions': 150,
    'hashtags': 32,
    'url': 75,
    'emojis': 80,
    'characters': 22000,
    'capital_letters': 680,
    'punctuations': 750,
    'num_sentence': 260,
    'av_sentence_par': 1.8,
    'av_char_par': 130,
    'num_words': 3500,
    'num_neg_words': 110,
    'num_unique': 10,
    'av_length': 6,
    'max_length': 50,
    'num_numbers': 50,
    'num_greater': 900,
    'num_smaller': 900,
}

# Thresholded columns, then the word n-gram vector, then the ratio features.
features = [
    *outliers,
    'wordgram',
    'ratio_det', 'ratio_pre', 'ratio_sing', 'ratio_plural', 'ratio_adv',
    'ratio_adj', 'ratio_pronouns', 'ratio_past', 'ratio_future',
]

classifier('xgboost', region_combi4_df, features, 'region', outliers, type='multi', ngram=True)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
