In [9]:
import twarc
import os
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
import requests
from transformers import pipeline
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation

nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
stop_words.append('uh')
puncs = string.punctuation

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/korfoo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Process the downloaded tweet json to csv files

In [2]:
all_columns=['conversation_id', 'id','author_id','text','in_reply_to_user_id']
for file in os.listdir("Airlines_conv"):
    if file.split(".")[1]=="json":
        print("--------------Processing file {0}--------------".format(file))
        df=pd.read_json(os.path.join("Airlines_conv",file),lines=True)
        all_tweets=[]
        i=100
        for line in df['data']:
            for tweet in line:
                tweet_info=[]

                for column in all_columns:
                    tweet_info.append(tweet[column])

                if tweet.get('referenced_tweets','')!='':
                    if tweet['referenced_tweets'][0]['type']=='replied_to':
                        tweet_info.append(tweet['referenced_tweets'][0]['id'])
                    else:
                        tweet_info.append('')
                else:
                    tweet_info.append('')
                
                if tweet.get('attachments','')!='':
                    tweet_info.append('1')
                else:
                    tweet_info.append('0')

                all_tweets.append(tweet_info)

            if len(all_tweets)>i:
                print("--------------Processed more than {0} tweets--------------".format(i))
                i=i*10

        for line in df['includes']:
            for tweet in line['tweets']:
                tweet_info=[]

                for column in all_columns:
                    tweet_info.append(tweet.get(column,''))

                if tweet.get('referenced_tweets','')!='':
                    if tweet['referenced_tweets'][0]['type']=='replied_to':
                        tweet_info.append(tweet['referenced_tweets'][0]['id'])
                    else:
                        tweet_info.append('')
                else:
                    tweet_info.append('')
                
                if tweet.get('attachments','')!='':
                    tweet_info.append('1')
                else:
                    tweet_info.append('0')
                
                all_tweets.append(tweet_info)

            if len(all_tweets)>i:
                print("--------------Processed more than {0} tweets--------------".format(i))
                i=i*10

        df_out = pd.DataFrame(all_tweets,columns=all_columns+['replied_to', 'attachment'])
        df_out = df_out.drop_duplicates().copy()
        df_out.to_csv("Airlines_conv/"+file.split(".")[0]+".csv", index=False)

--------------Processing file SAS.json--------------
--------------Processed more than 100 tweets--------------
--------------Processed more than 1000 tweets--------------
--------------Processed more than 10000 tweets--------------
--------------Processing file lufthansa.json--------------
--------------Processed more than 100 tweets--------------
--------------Processed more than 1000 tweets--------------
--------------Processed more than 10000 tweets--------------
--------------Processing file Ryanair.json--------------
--------------Processed more than 100 tweets--------------
--------------Processed more than 1000 tweets--------------
--------------Processed more than 10000 tweets--------------
--------------Processing file FlySWISS.json--------------
--------------Processed more than 100 tweets--------------
--------------Processed more than 1000 tweets--------------
--------------Processed more than 10000 tweets--------------
--------------Processed more than 100000 tweets------

## Process the downloaded tweet json to csv files

In [3]:
#Dictionary to store Airlines ids
Airlines_dict = {
        "SAS":"1379801",
        "lufthansa":"124476322",
        "Ryanair":"1542862735",
        "FlySWISS":"21764143",
        "Finnair":"16694416",
}

In [4]:
def do_labeling(org_ids):

    #DataFrame to store all labeled replies related to our task
    all_replies = pd.DataFrame()

    for file in os.listdir('Airlines_conv'):
        if file.split('.')[1]=='csv':
            NGO_name = file.split('.')[0]
            NGO_id = org_ids[NGO_name]

            #added terminator to fix tokenizer error for some files
            df = pd.read_csv("Airlines_conv/{0}".format(file), lineterminator='\n', dtype='str')

            #get the tweets on which users got replies
            ngo_replied_to_ids = df[(df["author_id"] == NGO_id) & (df["in_reply_to_user_id"] != NGO_id)]["replied_to"].dropna()
            ngo_replied_to = df[df['id'].isin(ngo_replied_to_ids)].dropna()
            ngo_replied_to["label"] = 1

            #get the tweets on which ngo didn't reply
            replies_to_ngo = df[(df["in_reply_to_user_id"]==NGO_id) & (df["author_id"]!=NGO_id)]
            replies_to_ngo = replies_to_ngo[~replies_to_ngo['id'].isin(ngo_replied_to_ids)].dropna()
            replies_to_ngo["label"] = 0

            #concatenate these tweets together
            all_ngo_replies = pd.concat([replies_to_ngo, ngo_replied_to]).reset_index(drop=True)
            all_ngo_replies["relatedOrg"] = NGO_name
            
            #add current ngo data to global df
            all_replies = pd.concat([all_replies, all_ngo_replies]).reset_index(drop=True)

            print("processed file: {0}".format(file))
            
    all_replies.to_csv("Airlines_conv/labeled_data.csv")


In [5]:
#label airlines
do_labeling(Airlines_dict)

processed file: lufthansa.csv
processed file: Ryanair.csv
processed file: Finnair.csv
processed file: FlySWISS.csv
processed file: SAS.csv


In [15]:
df = pd.read_csv("Airlines_conv/labeled_data.csv", index_col=0, dtype='str')
df.head(5)

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO
0,508892469325742080,509079299337379840,25053194,RT “@lufthansa: Wake up &amp; explore! Kicksta...,124476322,508892469325742080,1,0,lufthansa
1,864913478611611648,864921399571382272,850768472082702337,@lufthansa thanks Yazz &lt;3,124476322,864921345821466624,0,0,lufthansa
2,816245904915238912,816249093450301441,1260005671,"@lufthansa props to you Lufthansa, for excelle...",124476322,816247852221550596,0,0,lufthansa
3,1412003542534168577,1412129712336523278,164208360,@lufthansa @Lufthansa_DE Ok...still trying to ...,124476322,1412010759777931267,0,0,lufthansa
4,1235804900107395072,1235806840245583872,1105305437900075009,@lufthansa @lufthansa I already called custome...,124476322,1235806289264959488,0,0,lufthansa


In [7]:
#check statistics
df.groupby([df.label, df.relatedOrg]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment
label,relatedNGO,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,Finnair,2542,2542,2542,2542,2542,2542,2542
0,FlySWISS,1467,1467,1467,1467,1467,1467,1467
0,Ryanair,4912,4912,4912,4912,4912,4912,4912
0,SAS,1760,1760,1760,1760,1760,1760,1760
0,lufthansa,2382,2382,2382,2382,2382,2382,2382
1,Finnair,615,615,615,615,615,615,615
1,FlySWISS,1192,1192,1192,1192,1192,1192,1192
1,Ryanair,1403,1403,1403,1403,1403,1403,1403
1,SAS,1025,1025,1025,1025,1025,1025,1025
1,lufthansa,1350,1350,1350,1350,1350,1350,1350


## Process downloaded authors info

In [10]:
#save user ids to download through twarc
users = pd.DataFrame(df["author_id"].unique())
users.to_csv("Airlines_conv/users_ids.txt", index = False, header=False)

In [11]:
#Process the needed info to csv
df=pd.read_json("Airlines_conv/users.json",lines=True)
all_users=[]
public_columns=['followers_count', 'following_count','tweet_count']
additional_columns=['id','name', 'username', 'description','created_at']
for line in df['data']:
    for user in line:

        user_info=[]        
        for column in public_columns:
            try:
                user_info.append(user['public_metrics'][column])
            except:
                print(user)
        for column in additional_columns:
            user_info.append(user[column])

        all_users.append(user_info)

users = pd.DataFrame(all_users,columns=public_columns+additional_columns)
users = users.drop_duplicates().copy()
users = users.drop_duplicates(["id"]).copy()
users.to_csv("Airlines_conv/users.csv", index=False)

In [12]:
#read df
df = pd.read_csv("Airlines_conv/labeled_data.csv", index_col=0, dtype='str')
df

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO
0,508892469325742080,509079299337379840,25053194,RT “@lufthansa: Wake up &amp; explore! Kicksta...,124476322,508892469325742080,1,0,lufthansa
1,864913478611611648,864921399571382272,850768472082702337,@lufthansa thanks Yazz &lt;3,124476322,864921345821466624,0,0,lufthansa
2,816245904915238912,816249093450301441,1260005671,"@lufthansa props to you Lufthansa, for excelle...",124476322,816247852221550596,0,0,lufthansa
3,1412003542534168577,1412129712336523278,164208360,@lufthansa @Lufthansa_DE Ok...still trying to ...,124476322,1412010759777931267,0,0,lufthansa
4,1235804900107395072,1235806840245583872,1105305437900075009,@lufthansa @lufthansa I already called custome...,124476322,1235806289264959488,0,0,lufthansa
...,...,...,...,...,...,...,...,...,...
18645,1239300277183922177,1239453966481661952,2437384340,@SAS När kommer det finnas info om vilka desti...,1379801,1239300277183922177,0,1,SAS
18646,820935248687075328,820983985383833601,125634221,@SAS Thanks Andrew but thats very time consumi...,1379801,820945772522983426,0,1,SAS
18647,1295746033033654272,1296073564035178503,1243992961840566272,@SAS I have just booked a ticket fro Oslo to K...,1379801,1295746033033654272,0,1,SAS
18648,943881917149138944,943884189329436672,491431191,@SAS I had to turn it off for the purchase to ...,1379801,943882605287673857,0,1,SAS


In [16]:
#merge with existing tweet df
df_merged = df.merge(users, left_on='author_id', right_on='id', how='left')
df_merged = df_merged.drop(['name', 'description', 'created_at', 'id_y', 'username'], axis = 1)
df_merged = df_merged.rename(columns={"id_x":"id", "followers_count":"author_followers", "following_count":"author_following", "tweet_count":"author_tweets"})
df_merged

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,author_following,author_tweets
0,508892469325742080,509079299337379840,25053194,RT “@lufthansa: Wake up &amp; explore! Kicksta...,124476322,508892469325742080,1,0,lufthansa,3043.0,387.0,123495.0
1,864913478611611648,864921399571382272,850768472082702337,@lufthansa thanks Yazz &lt;3,124476322,864921345821466624,0,0,lufthansa,628.0,3176.0,29161.0
2,816245904915238912,816249093450301441,1260005671,"@lufthansa props to you Lufthansa, for excelle...",124476322,816247852221550596,0,0,lufthansa,4316.0,592.0,2885.0
3,1412003542534168577,1412129712336523278,164208360,@lufthansa @Lufthansa_DE Ok...still trying to ...,124476322,1412010759777931267,0,0,lufthansa,220.0,685.0,20518.0
4,1235804900107395072,1235806840245583872,1105305437900075009,@lufthansa @lufthansa I already called custome...,124476322,1235806289264959488,0,0,lufthansa,0.0,8.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...
18647,1239300277183922177,1239453966481661952,2437384340,@SAS När kommer det finnas info om vilka desti...,1379801,1239300277183922177,0,1,SAS,3241.0,546.0,65988.0
18648,820935248687075328,820983985383833601,125634221,@SAS Thanks Andrew but thats very time consumi...,1379801,820945772522983426,0,1,SAS,587.0,74.0,6326.0
18649,1295746033033654272,1296073564035178503,1243992961840566272,@SAS I have just booked a ticket fro Oslo to K...,1379801,1295746033033654272,0,1,SAS,7.0,76.0,398.0
18650,943881917149138944,943884189329436672,491431191,@SAS I had to turn it off for the purchase to ...,1379801,943882605287673857,0,1,SAS,1953.0,162.0,3710.0


## Process downloaded info of users in reply to whom the tweet was written

In [21]:
#get the user id whom the tweet was addressed to download data from twarc
replied_users = pd.DataFrame(df["in_reply_to_user_id"].unique())
replied_users.to_csv("Airlines_conv/replied_users_ids.txt", index = False, header=False)

In [22]:
#Process the needed info to csv
df=pd.read_json("Airlines_conv/replied_users.json",lines=True)
all_users=[]
public_columns=['followers_count', 'following_count','tweet_count']
additional_columns=['id','name', 'username', 'description','created_at']
for line in df['data']:
    for user in line:

        user_info=[]        
        for column in public_columns:
            try:
                user_info.append(user['public_metrics'][column])
            except:
                print(user)
        for column in additional_columns:
            user_info.append(user[column])

        all_users.append(user_info)

replied_users = pd.DataFrame(all_users,columns=public_columns+additional_columns)
replied_users = replied_users.drop_duplicates().copy()
replied_users = replied_users.drop_duplicates(["id"]).copy()
replied_users.to_csv("Airlines_conv/replied_users.csv", index=False)

In [24]:
#merge with existing tweet df
df_merged = df_merged.merge(replied_users, left_on='in_reply_to_user_id', right_on='id', how='left')
df_merged = df_merged.drop(['name', 'description', 'created_at', 'id_y'], axis = 1)
df_merged = df_merged.rename(columns={"id_x":"id", "followers_count":"replied_followers", "following_count":"replied_following", "tweet_count":"replied_tweets", "username":"replied_name"})
df_merged

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,author_following,author_tweets,replied_followers,replied_following,replied_tweets,replied_name
0,508892469325742080,509079299337379840,25053194,RT “@lufthansa: Wake up &amp; explore! Kicksta...,124476322,508892469325742080,1,0,lufthansa,3043.0,387.0,123495.0,534926.0,34.0,157070.0,lufthansa
1,864913478611611648,864921399571382272,850768472082702337,@lufthansa thanks Yazz &lt;3,124476322,864921345821466624,0,0,lufthansa,628.0,3176.0,29161.0,534926.0,34.0,157070.0,lufthansa
2,816245904915238912,816249093450301441,1260005671,"@lufthansa props to you Lufthansa, for excelle...",124476322,816247852221550596,0,0,lufthansa,4316.0,592.0,2885.0,534926.0,34.0,157070.0,lufthansa
3,1412003542534168577,1412129712336523278,164208360,@lufthansa @Lufthansa_DE Ok...still trying to ...,124476322,1412010759777931267,0,0,lufthansa,220.0,685.0,20518.0,534926.0,34.0,157070.0,lufthansa
4,1235804900107395072,1235806840245583872,1105305437900075009,@lufthansa @lufthansa I already called custome...,124476322,1235806289264959488,0,0,lufthansa,0.0,8.0,2.0,534926.0,34.0,157070.0,lufthansa
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18645,1239300277183922177,1239453966481661952,2437384340,@SAS När kommer det finnas info om vilka desti...,1379801,1239300277183922177,0,1,SAS,3241.0,546.0,65988.0,128064.0,117.0,60424.0,SAS
18646,820935248687075328,820983985383833601,125634221,@SAS Thanks Andrew but thats very time consumi...,1379801,820945772522983426,0,1,SAS,587.0,74.0,6326.0,128064.0,117.0,60424.0,SAS
18647,1295746033033654272,1296073564035178503,1243992961840566272,@SAS I have just booked a ticket fro Oslo to K...,1379801,1295746033033654272,0,1,SAS,7.0,76.0,398.0,128064.0,117.0,60424.0,SAS
18648,943881917149138944,943884189329436672,491431191,@SAS I had to turn it off for the purchase to ...,1379801,943882605287673857,0,1,SAS,1953.0,162.0,3710.0,128064.0,117.0,60424.0,SAS


In [25]:
#save the data
df_merged.to_csv("Airlines_conv/labeled_with_users.csv")

## Add additional features

In [29]:
df = pd.read_csv("Airlines_conv/labeled_with_users.csv", dtype='str', index_col= 0)
df.head(5)

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,author_following,author_tweets,replied_followers,replied_following,replied_tweets,replied_name
0,508892469325742080,509079299337379840,25053194,RT “@lufthansa: Wake up &amp; explore! Kicksta...,124476322,508892469325742080,1,0,lufthansa,3043.0,387.0,123495.0,534926.0,34.0,157070.0,lufthansa
1,864913478611611648,864921399571382272,850768472082702337,@lufthansa thanks Yazz &lt;3,124476322,864921345821466624,0,0,lufthansa,628.0,3176.0,29161.0,534926.0,34.0,157070.0,lufthansa
2,816245904915238912,816249093450301441,1260005671,"@lufthansa props to you Lufthansa, for excelle...",124476322,816247852221550596,0,0,lufthansa,4316.0,592.0,2885.0,534926.0,34.0,157070.0,lufthansa
3,1412003542534168577,1412129712336523278,164208360,@lufthansa @Lufthansa_DE Ok...still trying to ...,124476322,1412010759777931267,0,0,lufthansa,220.0,685.0,20518.0,534926.0,34.0,157070.0,lufthansa
4,1235804900107395072,1235806840245583872,1105305437900075009,@lufthansa @lufthansa I already called custome...,124476322,1235806289264959488,0,0,lufthansa,0.0,8.0,2.0,534926.0,34.0,157070.0,lufthansa


In [None]:
#drop rows with uncorrectly processed label if such exist
df = df[df['label'].notna()]
df = df.reset_index(drop=True)

In [32]:
#tokenize the text
df["text_new"]=df["text"].apply(lambda x: [word for word in [re.sub(f"[{puncs}]","",token).lower() for token in x.split() if token[0]!='@'] if word not in stop_words+['rt','']])
#number of user mentions "@" in a text
df["num_mentions"]=df["text"].apply(lambda x: len([i for i in x.split() if i[0]=='@']))
#number of links in a text
df["num_links"]=df["text"].apply(lambda x: len([i for i in x.split() if 'http'in i]))
#number of words in original text
df["num_full_words"] = df.text.apply(lambda x: len(x.split(' ')))
#number of words in tokenized text
df["num_tokenized_words"] = df.text_new.apply(lambda x: len(x))
#number of hashtags in text
df["num_hashtags"]=df.text.apply(lambda x: len([i for i in x.split() if i[0]=='#']))
#number of related_NGO mentions in a tweet
df["num_NGO_mentions"]=df.apply(lambda x: x.text.split(" ").count(f"@{x.relatedNGO}"), axis=1)
#number of exclamation marks in text
df["num_exclamation"]=df.apply(lambda x: x.text.count("!"), axis=1)
#number of question marks in a tweet
df["num_question"]=df.apply(lambda x: x.text.count("?"), axis=1)
#if the tweet was a retweeted one
df["retweet"] = df.text.apply(lambda x: 1 if x[:2]=="RT" else 0)
#number of characters in original text
df["num_characters"] = df.text.apply(lambda x: len(x))

df

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,...,replied_name,text_new,num_mentions,num_links,num_full_words,num_tokenized_words,num_hashtags,num_NGO_mentions,num_exclamation,num_question
0,508892469325742080,509079299337379840,25053194,RT “@lufthansa: Wake up &amp; explore! Kicksta...,124476322,508892469325742080,1,0,lufthansa,3043.0,...,lufthansa,"[“lufthansa, wake, amp, explore, kickstart, da...",0,1,22,13,1,0,1,0
1,864913478611611648,864921399571382272,850768472082702337,@lufthansa thanks Yazz &lt;3,124476322,864921345821466624,0,0,lufthansa,628.0,...,lufthansa,"[thanks, yazz, lt3]",1,0,4,3,0,1,0,0
2,816245904915238912,816249093450301441,1260005671,"@lufthansa props to you Lufthansa, for excelle...",124476322,816247852221550596,0,0,lufthansa,4316.0,...,lufthansa,"[props, lufthansa, excellent, flights, especia...",1,0,11,7,1,1,1,0
3,1412003542534168577,1412129712336523278,164208360,@lufthansa @Lufthansa_DE Ok...still trying to ...,124476322,1412010759777931267,0,0,lufthansa,220.0,...,lufthansa,"[okstill, trying, finalize, booking, better, f...",2,0,26,14,0,1,0,3
4,1235804900107395072,1235806840245583872,1105305437900075009,@lufthansa @lufthansa I already called custome...,124476322,1235806289264959488,0,0,lufthansa,0.0,...,lufthansa,"[already, called, customer, service, helpful, ...",2,0,52,26,0,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18645,1239300277183922177,1239453966481661952,2437384340,@SAS När kommer det finnas info om vilka desti...,1379801,1239300277183922177,0,1,SAS,3241.0,...,SAS,"[när, kommer, det, finnas, info, om, vilka, de...",1,0,18,17,0,1,0,1
18646,820935248687075328,820983985383833601,125634221,@SAS Thanks Andrew but thats very time consumi...,1379801,820945772522983426,0,1,SAS,587.0,...,SAS,"[thanks, andrew, thats, time, consuming, curre...",1,0,15,8,0,1,0,1
18647,1295746033033654272,1296073564035178503,1243992961840566272,@SAS I have just booked a ticket fro Oslo to K...,1379801,1295746033033654272,0,1,SAS,7.0,...,SAS,"[booked, ticket, fro, oslo, kiev, mine, bestil...",1,0,33,17,0,1,0,0
18648,943881917149138944,943884189329436672,491431191,@SAS I had to turn it off for the purchase to ...,1379801,943882605287673857,0,1,SAS,1953.0,...,SAS,"[turn, purchase, go, normal, state, affairs, w...",1,0,33,12,0,1,0,0


In [None]:
#save the data
df.to_csv("Airlines_conv/Airlines_extended_features.csv")

## Sentiment analysis of tweets

In [31]:
#set hugging face model
model = "cardiffnlp/twitter-roberta-base-sentiment-latest"

#set the pipeline
sentiment_pipeline = pipeline("sentiment-analysis", model = model, tokenizer = model)

#check output for one of the tweets
sentiment_pipeline(df["text"][0])[0]["label"]

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


'positive'

In [33]:
#get sentiment while treating errors
def set_sentimet(text):
    try:
        sentiment = sentiment_pipeline(text)[0]["label"]
    except:
        sentiment = "Error"
    return sentiment

In [34]:
#add sentiment feature to the dataset
df["sentiment"] = df.text.apply(lambda x: set_sentimet(x))
df = df[df.sentiment!="Error"]
df

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,...,text_new,num_mentions,num_links,num_full_words,num_tokenized_words,num_hashtags,num_NGO_mentions,num_exclamation,num_question,sentiment
0,508892469325742080,509079299337379840,25053194,RT “@lufthansa: Wake up &amp; explore! Kicksta...,124476322,508892469325742080,1,0,lufthansa,3043.0,...,"[“lufthansa, wake, amp, explore, kickstart, da...",0,1,22,13,1,0,1,0,positive
1,864913478611611648,864921399571382272,850768472082702337,@lufthansa thanks Yazz &lt;3,124476322,864921345821466624,0,0,lufthansa,628.0,...,"[thanks, yazz, lt3]",1,0,4,3,0,1,0,0,positive
2,816245904915238912,816249093450301441,1260005671,"@lufthansa props to you Lufthansa, for excelle...",124476322,816247852221550596,0,0,lufthansa,4316.0,...,"[props, lufthansa, excellent, flights, especia...",1,0,11,7,1,1,1,0,positive
3,1412003542534168577,1412129712336523278,164208360,@lufthansa @Lufthansa_DE Ok...still trying to ...,124476322,1412010759777931267,0,0,lufthansa,220.0,...,"[okstill, trying, finalize, booking, better, f...",2,0,26,14,0,1,0,3,negative
4,1235804900107395072,1235806840245583872,1105305437900075009,@lufthansa @lufthansa I already called custome...,124476322,1235806289264959488,0,0,lufthansa,0.0,...,"[already, called, customer, service, helpful, ...",2,0,52,26,0,2,0,0,negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18645,1239300277183922177,1239453966481661952,2437384340,@SAS När kommer det finnas info om vilka desti...,1379801,1239300277183922177,0,1,SAS,3241.0,...,"[när, kommer, det, finnas, info, om, vilka, de...",1,0,18,17,0,1,0,1,neutral
18646,820935248687075328,820983985383833601,125634221,@SAS Thanks Andrew but thats very time consumi...,1379801,820945772522983426,0,1,SAS,587.0,...,"[thanks, andrew, thats, time, consuming, curre...",1,0,15,8,0,1,0,1,negative
18647,1295746033033654272,1296073564035178503,1243992961840566272,@SAS I have just booked a ticket fro Oslo to K...,1379801,1295746033033654272,0,1,SAS,7.0,...,"[booked, ticket, fro, oslo, kiev, mine, bestil...",1,0,33,17,0,1,0,0,negative
18648,943881917149138944,943884189329436672,491431191,@SAS I had to turn it off for the purchase to ...,1379801,943882605287673857,0,1,SAS,1953.0,...,"[turn, purchase, go, normal, state, affairs, w...",1,0,33,12,0,1,0,0,negative


In [37]:
#save the data
df.to_csv("Airlines_conv/Airlines_extended_features.csv")

## Topic modelling for tweets

In [7]:
#read the data
df = pd.read_csv("Airlines_conv/Airlines_extended_features.csv", dtype="str", index_col=0)
df.head(5)

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,...,num_links,num_full_words,num_tokenized_words,num_hashtags,num_NGO_mentions,num_exclamation,num_question,sentiment,retweet,num_characters
0,508892469325742080,509079299337379840,25053194,RT “@lufthansa: Wake up &amp; explore! Kicksta...,124476322,508892469325742080,1,0,lufthansa,3043.0,...,1,22,13,1,0,1,0,positive,1,136
1,864913478611611648,864921399571382272,850768472082702337,@lufthansa thanks Yazz &lt;3,124476322,864921345821466624,0,0,lufthansa,628.0,...,0,4,3,0,1,0,0,positive,0,28
2,816245904915238912,816249093450301441,1260005671,"@lufthansa props to you Lufthansa, for excelle...",124476322,816247852221550596,0,0,lufthansa,4316.0,...,0,11,7,1,1,1,0,positive,0,95
3,1412003542534168577,1412129712336523278,164208360,@lufthansa @Lufthansa_DE Ok...still trying to ...,124476322,1412010759777931267,0,0,lufthansa,220.0,...,0,26,14,0,1,0,3,negative,0,186
4,1235804900107395072,1235806840245583872,1105305437900075009,@lufthansa @lufthansa I already called custome...,124476322,1235806289264959488,0,0,lufthansa,0.0,...,0,52,26,0,2,0,0,negative,0,287


In [8]:
my_stopwords = nltk.corpus.stopwords.words('english')
word_rooter = nltk.stem.snowball.PorterStemmer(ignore_stopwords=False).stem
my_punctuation = '!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~•@'

#process the tweet text
def remove_links(text):
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'bit.ly/\S+', '', text)
    text = text.strip('[link]')
    return text

def remove_users(text):
    text = re.sub('(RT\s@[A-Za-z]+[A-Za-z0-9-_]+)', '', text)
    text = re.sub('(@[A-Za-z]+[A-Za-z0-9-_]+)', '', text)
    return text

# further cleaning
def process_tweet(tweet, bigrams=False):
    tweet = remove_users(tweet)
    tweet = remove_links(tweet)
    tweet = tweet.lower()
    #strip punctuation
    tweet = re.sub('['+my_punctuation + ']+', ' ', tweet)
    #remove double spacing
    tweet = re.sub('\s+', ' ', tweet)
    #remove numbers
    tweet = re.sub('([0-9]+)', '', tweet)
    #remove stopwords
    tweet_token_list = [word for word in tweet.split(' ')
                            if word not in my_stopwords]

    #apply word rooter
    tweet_token_list = [word_rooter(word) if '#' not in word else word
                        for word in tweet_token_list]

    tweet = ' '.join(tweet_token_list)
    return tweet

#do the processing for topic modeling prep
df['processed_tweet'] = df.text.apply(process_tweet)
df.head(5)

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,...,num_full_words,num_tokenized_words,num_hashtags,num_NGO_mentions,num_exclamation,num_question,sentiment,retweet,num_characters,processed_tweet
0,508892469325742080,509079299337379840,25053194,RT “@lufthansa: Wake up &amp; explore! Kicksta...,124476322,508892469325742080,1,0,lufthansa,3043.0,...,22,13,1,0,1,0,positive,1,136,rt “ wake amp explor kickstart day way perfe...
1,864913478611611648,864921399571382272,850768472082702337,@lufthansa thanks Yazz &lt;3,124476322,864921345821466624,0,0,lufthansa,628.0,...,4,3,0,1,0,0,positive,0,28,thank yazz lt
2,816245904915238912,816249093450301441,1260005671,"@lufthansa props to you Lufthansa, for excelle...",124476322,816247852221550596,0,0,lufthansa,4316.0,...,11,7,1,1,1,0,positive,0,95,prop lufthansa excel flight especi oversea #f...
3,1412003542534168577,1412129712336523278,164208360,@lufthansa @Lufthansa_DE Ok...still trying to ...,124476322,1412010759777931267,0,0,lufthansa,220.0,...,26,14,0,1,0,3,negative,0,186,ok still tri final book better full day seldo...
4,1235804900107395072,1235806840245583872,1105305437900075009,@lufthansa @lufthansa I already called custome...,124476322,1235806289264959488,0,0,lufthansa,0.0,...,52,26,0,2,0,0,negative,0,287,alreadi call custom servic help school cancel...


In [10]:
#assign vectorizer, which get rids of words that are more than in 90% of the tweets and those that are in less than 25.
vectorizer = CountVectorizer(max_df=0.9, min_df=25, token_pattern='\w+|\$[\d\.]+|\S+')

#apply transformation
tf = vectorizer.fit_transform(df['processed_tweet']).toarray()

#get feature names
tf_feature_names = vectorizer.get_feature_names_out()

#assign model
model = LatentDirichletAllocation(n_components=6, random_state=0)

In [11]:
#check that tf was done correctly in terms of tweet dimensions
tf.shape

(18650, 1101)

In [12]:
#fit the model
model.fit(tf)

In [13]:
#Display top n words from topics
def display_topics(model, feature_names, no_top_words):
    topic_dict = {}
    for topic_idx, topic in enumerate(model.components_):
        topic_dict["Topic %d words" % (topic_idx)]= ['{}'.format(feature_names[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
        topic_dict["Topic %d weights" % (topic_idx)]= ['{:.1f}'.format(topic[i])
                        for i in topic.argsort()[:-no_top_words - 1:-1]]
    return pd.DataFrame(topic_dict)

display_topics(model, tf_feature_names, no_top_words=10)

Unnamed: 0,Topic 0 words,Topic 0 weights,Topic 1 words,Topic 1 weights,Topic 2 words,Topic 2 weights,Topic 3 words,Topic 3 weights,Topic 4 words,Topic 4 weights,Topic 5 words,Topic 5 weights
0,’t,852.3,book,684.1,pleas,837.4,det,426.2,fli,786.7,flight,2278.5
1,’,676.0,flight,392.4,custom,789.4,de,302.2,thank,617.1,refund,1474.3
2,i,465.8,check,362.3,call,724.9,att,260.2,much,334.5,thank,1062.5
3,sjpeac,421.2,one,310.2,servic,717.8,en,244.2,seat,334.2,cancel,1010.9
4,custom,417.4,amp,262.1,tri,700.2,på,237.2,great,290.8,still,642.1
5,it,348.9,airport,260.1,email,656.6,int,221.2,would,274.7,wait,565.3
6,staff,313.9,case,248.6,thank,630.1,jag,207.2,look,262.5,get,471.9
7,don,298.7,id,236.1,help,580.4,är,198.2,flight,253.0,day,402.1
8,servic,288.0,chang,219.3,answer,545.4,och,197.2,travel,247.2,th,370.1
9,flight,234.4,airlin,188.2,contact,533.0,die,191.2,like,223.0,back,332.7


In [14]:
#assign topics to tweets
topic_probabilities = model.transform(tf)
topic_list = []
for i in topic_probabilities:
   topic_list.append(str(i.argmax()))

topic_df = pd.DataFrame(topic_list, columns=['topic'])

df["topic"] = topic_df["topic"]
df

Unnamed: 0,conversation_id,id,author_id,text,in_reply_to_user_id,replied_to,attachment,label,relatedNGO,author_followers,...,num_tokenized_words,num_hashtags,num_NGO_mentions,num_exclamation,num_question,sentiment,retweet,num_characters,processed_tweet,topic
0,508892469325742080,509079299337379840,25053194,RT “@lufthansa: Wake up &amp; explore! Kicksta...,124476322,508892469325742080,1,0,lufthansa,3043.0,...,13,1,0,1,0,positive,1,136,rt “ wake amp explor kickstart day way perfe...,1
1,864913478611611648,864921399571382272,850768472082702337,@lufthansa thanks Yazz &lt;3,124476322,864921345821466624,0,0,lufthansa,628.0,...,3,0,1,0,0,positive,0,28,thank yazz lt,5
2,816245904915238912,816249093450301441,1260005671,"@lufthansa props to you Lufthansa, for excelle...",124476322,816247852221550596,0,0,lufthansa,4316.0,...,7,1,1,1,0,positive,0,95,prop lufthansa excel flight especi oversea #f...,4
3,1412003542534168577,1412129712336523278,164208360,@lufthansa @Lufthansa_DE Ok...still trying to ...,124476322,1412010759777931267,0,0,lufthansa,220.0,...,14,0,1,0,3,negative,0,186,ok still tri final book better full day seldo...,5
4,1235804900107395072,1235806840245583872,1105305437900075009,@lufthansa @lufthansa I already called custome...,124476322,1235806289264959488,0,0,lufthansa,0.0,...,26,0,2,0,0,negative,0,287,alreadi call custom servic help school cancel...,5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18645,1239300277183922177,1239453966481661952,2437384340,@SAS När kommer det finnas info om vilka desti...,1379801,1239300277183922177,0,1,SAS,3241.0,...,17,0,1,0,1,neutral,0,109,när kommer det finna info om vilka destinatio...,3
18646,820935248687075328,820983985383833601,125634221,@SAS Thanks Andrew but thats very time consumi...,1379801,820945772522983426,0,1,SAS,587.0,...,8,0,1,0,1,negative,0,90,thank andrew that time consum current set alt...,4
18647,1295746033033654272,1296073564035178503,1243992961840566272,@SAS I have just booked a ticket fro Oslo to K...,1379801,1295746033033654272,0,1,SAS,7.0,...,17,0,1,0,0,negative,0,174,book ticket fro oslo kiev mine bestilling not...,1
18648,943881917149138944,943884189329436672,491431191,@SAS I had to turn it off for the purchase to ...,1379801,943882605287673857,0,1,SAS,1953.0,...,12,0,1,0,0,negative,0,152,turn purchas go normal state affair would sus...,5


In [15]:
#Drop processed tweet column
df = df.drop(["processed_tweet"], axis=1)

In [16]:
#save the data
df.to_csv("Airlines_conv/Airlines_extended_features.csv")