# Business idea

This file contains all the plots we made related to our business idea. 

##### Business idea statistics 

- [The timeframe](#The-timeframe)
- [The normal topics dataframe](#Dataframe-for-all-the-tweets-in-a-conversation)
- [The dataframes per airline](#Dataframes-per-Airline)
- [Number of tweets per topic (in presentation)](#Number-of-tweets-per-topic)
- [Number of tweets per airline](#Number-of-tweets-per-airline-that-have-a-topic)
- [Number of tweets per topic for all airlines separately](#Number-of-tweets-per-topic-per-Airline)
- [Percentage of tweets per airline for all topics separately](#Percentage-of-tweets-per-topic-for-all-airlines)
- [Stacked bar chart with airlines and topics (in presentation)](#Stacked-bar-chart-topics-and-airlines)
- [Co-occurring topics](#Number-of-cooccuring-topics)

In [None]:
# Imports and setting up MySQL

import mysql.connector
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from config import config

# Open the MySQL connection; host, user, password and database name are all
# read from the imported `config` object, so no credentials live in the notebook.
connection = mysql.connector.connect(
    host=config.get('HOST'),
    user=config.get('USERNAME'),
    password=config.get('PASSWORD'),
    database=config.get('DATABASE')
)

# Cursor on the connection above. The cells shown in this section query through
# pd.read_sql(..., connection) and do not reference the cursor themselves.
cursor = connection.cursor()

### The timeframe

In [None]:
# Analysis window. Each bound is given as [year, month, day] strings.
timeframe = {'start': ['2019', '06', '10'], 'end': ['2020', '01', '12']}

# Day/month/year strings. They are built with join() instead of an f-string that
# reuses single quotes inside the replacement fields, which is a SyntaxError on
# Python versions before 3.12.
timeframe_start = '/'.join(reversed(timeframe['start']))
timeframe_end = '/'.join(reversed(timeframe['end']))

# The explicit format matters: without it pandas reads 'dd/mm/yyyy' month-first,
# turning 10/06/2019 into 6 October 2019 and 12/01/2020 into 1 December 2020.
# NOTE(review): both bounds are midnight timestamps, so a `<= end_date` filter
# excludes tweets posted later on the end day - confirm that this is intended.
start_date = pd.to_datetime(timeframe_start, format='%d/%m/%Y')
end_date = pd.to_datetime(timeframe_end, format='%d/%m/%Y')

### Dataframe for all the tweets in a conversation

In [None]:
# Query 1: English tweets that mention at least one of the four airlines.
conv_topics1 = """
    SELECT text, id, staff, baggage, delay_and_cancellation, money, timestamp_ms
    FROM tweets 
    WHERE TRIM(language) = 'en' AND (mentioned_airlines LIKE '%Lufthansa%' OR mentioned_airlines LIKE '%British_Airways%' OR mentioned_airlines LIKE '%KLM%' OR mentioned_airlines LIKE '%AirFrance%')
"""
# Query 2: English tweets linked (through hasher) to a conversation whose
# airline field matches one of the four airlines.
conv_topics2 = """
    SELECT tweets.text, tweets.id, tweets.staff, tweets.baggage, tweets.delay_and_cancellation, tweets.money, tweets.timestamp_ms
    FROM tweets 
    JOIN hasher ON tweets.id = hasher.id
    JOIN conversations ON hasher.conversation_id = conversations.conversation_id
    WHERE TRIM(language) = 'en' AND conversations.conversation_id IN (
        SELECT conv.conversation_id
        FROM conversations AS conv
        WHERE conv.airline LIKE '%Lufthansa%' OR conv.airline LIKE '%British_Airways%' OR conv.airline LIKE '%KLM%' OR conv.airline LIKE '%AirFrance%'
    )
"""

# Run both queries in order.
df_topics1, df_topics2 = (
    pd.read_sql(query, connection) for query in (conv_topics1, conv_topics2)
)

# Stack the two result sets, drop rows that are exact duplicates and add a
# datetime column derived from the millisecond timestamp.
df_topics = (
    pd.concat([df_topics1, df_topics2], axis=0, ignore_index=True)
    .drop_duplicates()
    .assign(time=lambda frame: pd.to_datetime(frame['timestamp_ms'], unit='ms'))
)

Create a new dataframe that keeps only the tweets posted within the selected timeframe (from `start_date` up to and including `end_date`).

In [None]:
# True for tweets whose timestamp lies in [start_date, end_date], bounds included.
mask = df_topics['time'].between(start_date, end_date)

# Independent copy, so later changes do not touch df_topics.
df_topics_time = df_topics[mask].copy()

In [None]:
# Drops all the columns that we do not need anymore, leaving only the topic flags.
# errors='ignore' keeps this cell re-runnable: df_topics_time is overwritten here,
# so running the cell a second time would otherwise raise a KeyError because the
# columns are already gone.

columns_drop = ['text', 'id', 'timestamp_ms', 'time']
df_topics_time = df_topics_time.drop(columns=columns_drop, errors='ignore')

### Dataframes per Airline

In [None]:
# Builds the dataframe for British Airways.

# Query 1: English tweets that mention British Airways.
conv_ba1 = """
    SELECT text, id, staff, baggage, delay_and_cancellation, money, timestamp_ms
    FROM tweets 
    WHERE TRIM(language) = 'en' AND mentioned_airlines LIKE '%British_Airways%'
"""
# Query 2: English tweets linked (through hasher) to a conversation whose
# airline field matches British Airways.
conv_ba2 = """
    SELECT tweets.text, tweets.id, tweets.staff, tweets.baggage, tweets.delay_and_cancellation, tweets.money, tweets.timestamp_ms
    FROM tweets 
    JOIN hasher ON tweets.id = hasher.id
    JOIN conversations ON hasher.conversation_id = conversations.conversation_id
    WHERE TRIM(language) = 'en' AND conversations.conversation_id IN (
        SELECT conv.conversation_id
        FROM conversations AS conv
        WHERE conv.airline LIKE '%British_Airways%'
    )
"""

# Run both queries in order.
df_ba1, df_ba2 = (pd.read_sql(query, connection) for query in (conv_ba1, conv_ba2))

# Stack the results, drop exact duplicate rows and add the datetime column.
df_ba = (
    pd.concat([df_ba1, df_ba2], axis=0, ignore_index=True)
    .drop_duplicates()
    .assign(time=lambda frame: pd.to_datetime(frame['timestamp_ms'], unit='ms'))
)

In [None]:
# Builds the dataframe for AirFrance.

# Query 1: English tweets that mention AirFrance.
conv_af1 = """
    SELECT text, id, staff, baggage, delay_and_cancellation, money, timestamp_ms
    FROM tweets 
    WHERE TRIM(language) = 'en' AND mentioned_airlines LIKE '%AirFrance%'
"""
# Query 2: English tweets linked (through hasher) to a conversation whose
# airline field matches AirFrance.
conv_af2 = """
    SELECT tweets.text, tweets.id, tweets.staff, tweets.baggage, tweets.delay_and_cancellation, tweets.money, tweets.timestamp_ms
    FROM tweets 
    JOIN hasher ON tweets.id = hasher.id
    JOIN conversations ON hasher.conversation_id = conversations.conversation_id
    WHERE TRIM(language) = 'en' AND conversations.conversation_id IN (
        SELECT conv.conversation_id
        FROM conversations AS conv
        WHERE conv.airline LIKE '%AirFrance%'
    )
"""

# Run both queries in order.
df_af1, df_af2 = (pd.read_sql(query, connection) for query in (conv_af1, conv_af2))

# Stack the results, drop exact duplicate rows and add the datetime column.
df_af = (
    pd.concat([df_af1, df_af2], axis=0, ignore_index=True)
    .drop_duplicates()
    .assign(time=lambda frame: pd.to_datetime(frame['timestamp_ms'], unit='ms'))
)

In [None]:
# Builds the dataframe for KLM.

# Query 1: English tweets that mention KLM.
conv_klm1 = """
    SELECT text, id, staff, baggage, delay_and_cancellation, money, timestamp_ms
    FROM tweets 
    WHERE TRIM(language) = 'en' AND mentioned_airlines LIKE '%KLM%'
"""
# Query 2: English tweets linked (through hasher) to a conversation whose
# airline field matches KLM.
conv_klm2 = """
    SELECT tweets.text, tweets.id, tweets.staff, tweets.baggage, tweets.delay_and_cancellation, tweets.money, tweets.timestamp_ms
    FROM tweets 
    JOIN hasher ON tweets.id = hasher.id
    JOIN conversations ON hasher.conversation_id = conversations.conversation_id
    WHERE TRIM(language) = 'en' AND conversations.conversation_id IN (
        SELECT conv.conversation_id
        FROM conversations AS conv
        WHERE conv.airline LIKE '%KLM%'
    )
"""

# Run both queries in order.
df_klm1, df_klm2 = (pd.read_sql(query, connection) for query in (conv_klm1, conv_klm2))

# Stack the results, drop exact duplicate rows and add the datetime column.
df_klm = (
    pd.concat([df_klm1, df_klm2], axis=0, ignore_index=True)
    .drop_duplicates()
    .assign(time=lambda frame: pd.to_datetime(frame['timestamp_ms'], unit='ms'))
)

In [None]:
# Builds the dataframe for Lufthansa.

# Query 1: English tweets that mention Lufthansa.
conv_lh1 = """
    SELECT text, id, staff, baggage, delay_and_cancellation, money, timestamp_ms
    FROM tweets 
    WHERE TRIM(language) = 'en' AND mentioned_airlines LIKE '%Lufthansa%'
"""
# Query 2: English tweets linked (through hasher) to a conversation whose
# airline field matches Lufthansa.
conv_lh2 = """
    SELECT tweets.text, tweets.id, tweets.staff, tweets.baggage, tweets.delay_and_cancellation, tweets.money, tweets.timestamp_ms
    FROM tweets 
    JOIN hasher ON tweets.id = hasher.id
    JOIN conversations ON hasher.conversation_id = conversations.conversation_id
    WHERE TRIM(language) = 'en' AND conversations.conversation_id IN (
        SELECT conv.conversation_id
        FROM conversations AS conv
        WHERE conv.airline LIKE '%Lufthansa%'
    )
"""

# Run both queries in order.
df_lh1, df_lh2 = (pd.read_sql(query, connection) for query in (conv_lh1, conv_lh2))

# Stack the results, drop exact duplicate rows and add the datetime column.
df_lh = (
    pd.concat([df_lh1, df_lh2], axis=0, ignore_index=True)
    .drop_duplicates()
    .assign(time=lambda frame: pd.to_datetime(frame['timestamp_ms'], unit='ms'))
)

In [None]:
# Boolean masks selecting, per airline, the tweets whose timestamp lies in
# [start_date, end_date]; between() includes both bounds by default.

mask_ba = df_ba['time'].between(start_date, end_date)
mask_af = df_af['time'].between(start_date, end_date)
mask_klm = df_klm['time'].between(start_date, end_date)
mask_lh = df_lh['time'].between(start_date, end_date)

In [None]:
# Apply each airline's mask to keep only the rows inside the timeframe.

df_ba_time = df_ba[mask_ba]
df_af_time = df_af[mask_af]
df_klm_time = df_klm[mask_klm]
df_lh_time = df_lh[mask_lh]

In [None]:
# Remove the columns listed in columns_drop so that only the topic flag
# columns remain for each airline.

df_topics_ba = df_ba_time.drop(labels=columns_drop, axis='columns')
df_topics_af = df_af_time.drop(labels=columns_drop, axis='columns')
df_topics_klm = df_klm_time.drop(labels=columns_drop, axis='columns')
df_topics_lh = df_lh_time.drop(labels=columns_drop, axis='columns')

# Plots

### Number of tweets per topic

In [None]:
# For every topic column, look up how many rows carry the value 1 (0 if none do).
counts_topics = df_topics_time.apply(lambda col: col.value_counts().get(1, 0))
print(counts_topics)

# Bar chart on an explicitly created axes; rot=0 keeps the tick labels horizontal.
fig, ax = plt.subplots(figsize=(10, 6))
counts_topics.plot(kind='bar', color=sns.color_palette('colorblind'), rot=0, ax=ax)
ax.set_xlabel('topics')
ax.set_ylabel('number of tweets')
ax.set_title('Number of tweets per topic', weight='bold')
plt.show()

### Number of tweets per airline that have a topic

In [None]:
# Per airline: how often each topic flag is 1, and the total over all topics.
# Note that a tweet flagged for several topics contributes once per topic to the total.

def _count_ones(column):
    """Return the value-count stored under 1 for ``column``, or 0 when absent."""
    return column.value_counts().get(1, 0)


counts_ba, counts_af, counts_klm, counts_lh = (
    frame.apply(_count_ones)
    for frame in (df_topics_ba, df_topics_af, df_topics_klm, df_topics_lh)
)

counts_ba_sum, counts_af_sum, counts_klm_sum, counts_lh_sum = (
    counts.sum() for counts in (counts_ba, counts_af, counts_klm, counts_lh)
)

In [None]:
# One row per airline with its total topic count (the counts_*_sum values).
combined_counts = pd.DataFrame({
    'Airline': ['British_Airways', 'AirFrance', 'KLM', 'Lufthansa'],
    'Count': [counts_ba_sum, counts_af_sum, counts_klm_sum, counts_lh_sum]
})

# One bar colour per airline.
colors = ['skyblue', 'red', 'orange', 'pink']

# DataFrame.plot opens a figure of its own unless it is handed an axes. A bare
# plt.figure(figsize=...) before it therefore only leaves an empty figure in the
# output and the requested size is never applied. Creating the axes explicitly
# and passing it in fixes both problems.
fig, ax = plt.subplots(figsize=(10, 6))
combined_counts.plot(kind='bar', x='Airline', y='Count', color=colors, legend=False, rot=0, ax=ax)
ax.set_xlabel('Airline')
ax.set_ylabel('Number of tweets')
ax.set_title('Number of tweets per airline belonging to a topic', weight='bold')
plt.show()

### Number of tweets per topic per Airline

##### British Airways

In [None]:
# Bar colours for the per-airline topic charts below (one colour per topic bar).
# This re-binds `colors`, which the previous chart used for the airline colours.
colors = ['blue', 'green', 'purple', 'orange']

In [None]:
# Bar chart of the British Airways topic counts, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(10, 6))
counts_ba.plot(kind='bar', color=colors, rot=0, ax=ax)
ax.set_xlabel('topics')
ax.set_ylabel('number of tweets')
ax.set_title('Number of tweets per topic for British Airways', weight='bold')
plt.show()

##### AirFrance

In [None]:
# Bar chart of the AirFrance topic counts, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(10, 6))
counts_af.plot(kind='bar', color=colors, rot=0, ax=ax)
ax.set_xlabel('topics')
ax.set_ylabel('number of tweets')
ax.set_title('Number of tweets per topic for AirFrance', weight='bold')
plt.show()

##### KLM

In [None]:
# Bar chart of the KLM topic counts, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(10, 6))
counts_klm.plot(kind='bar', color=colors, rot=0, ax=ax)
ax.set_xlabel('topics')
ax.set_ylabel('number of tweets')
ax.set_title('Number of tweets per topic for KLM', weight='bold')
plt.show()

##### Lufthansa

In [None]:
# Bar chart of the Lufthansa topic counts, drawn on an explicit axes.
fig, ax = plt.subplots(figsize=(10, 6))
counts_lh.plot(kind='bar', color=colors, rot=0, ax=ax)
ax.set_xlabel('topics')
ax.set_ylabel('number of tweets')
ax.set_title('Number of tweets per topic for Lufthansa', weight='bold')
plt.show()

### Percentage of tweets per topic for all airlines

In [None]:
# For each airline, a Series holding every topic's share (in %) of that
# airline's total topic count, rounded to two decimals.

percentages_ba = counts_ba.div(counts_ba_sum).mul(100).round(2)
percentages_af = counts_af.div(counts_af_sum).mul(100).round(2)
percentages_klm = counts_klm.div(counts_klm_sum).mul(100).round(2)
percentages_lh = counts_lh.div(counts_lh_sum).mul(100).round(2)

##### Staff

In [None]:
# Staff share per airline, looked up by topic label. The percentage Series are
# indexed by topic name, and integer keys such as [0] on a label-indexed Series
# are deprecated positional lookups in pandas.
per_staff_ba = percentages_ba['staff']
per_staff_af = percentages_af['staff']
per_staff_klm = percentages_klm['staff']
per_staff_lh = percentages_lh['staff']

percentages_staff = pd.DataFrame({
    'Airline': ['British Airways', 'AirFrance', 'KLM', 'Lufthansa'],
    'Percent_staff': [per_staff_ba, per_staff_af, per_staff_klm, per_staff_lh]
})

# One bar colour per airline.
colors_airlines = ['skyblue', 'red', 'orange', 'pink']

# DataFrame.plot opens its own figure unless it is given an axes, so the axes is
# created explicitly; a bare plt.figure(figsize=...) would stay empty and the
# figure size would not be applied to the chart.
fig, ax = plt.subplots(figsize=(10, 6))
percentages_staff.plot(kind='bar', x='Airline', y='Percent_staff', color=colors_airlines, legend=False, rot=0, ax=ax)
ax.set_xlabel('Airline')
ax.set_ylabel('Percentage of staff tweets (%)')
ax.set_title('Percentage of tweets about staff per airline', weight='bold')
plt.show()



##### Baggage

In [None]:
# Baggage share per airline, looked up by topic label. The percentage Series are
# indexed by topic name, and integer keys such as [1] on a label-indexed Series
# are deprecated positional lookups in pandas.
per_bagg_ba = percentages_ba['baggage']
per_bagg_af = percentages_af['baggage']
per_bagg_klm = percentages_klm['baggage']
per_bagg_lh = percentages_lh['baggage']

percentages_bagg = pd.DataFrame({
    'Airline': ['British Airways', 'AirFrance', 'KLM', 'Lufthansa'],
    'Percent_bagg': [per_bagg_ba, per_bagg_af, per_bagg_klm, per_bagg_lh]
})

# One bar colour per airline.
colors_airlines = ['skyblue', 'red', 'orange', 'pink']

# DataFrame.plot opens its own figure unless it is given an axes, so the axes is
# created explicitly; a bare plt.figure(figsize=...) would stay empty and the
# figure size would not be applied to the chart.
fig, ax = plt.subplots(figsize=(10, 6))
percentages_bagg.plot(kind='bar', x='Airline', y='Percent_bagg', color=colors_airlines, legend=False, rot=0, ax=ax)
ax.set_xlabel('Airline')
ax.set_ylabel('Percentage of baggage tweets (%)')
ax.set_title('Percentage of tweets about baggage per airline', weight='bold')
plt.show()

##### Delay and Cancellation

In [None]:
# Delay-and-cancellation share per airline, looked up by topic label. The
# percentage Series are indexed by topic name, and integer keys such as [2] on a
# label-indexed Series are deprecated positional lookups in pandas.
per_delay_ba = percentages_ba['delay_and_cancellation']
per_delay_af = percentages_af['delay_and_cancellation']
per_delay_klm = percentages_klm['delay_and_cancellation']
per_delay_lh = percentages_lh['delay_and_cancellation']

percentages_delay = pd.DataFrame({
    'Airline': ['British Airways', 'AirFrance', 'KLM', 'Lufthansa'],
    'Percent_delay': [per_delay_ba, per_delay_af, per_delay_klm, per_delay_lh]
})

# One bar colour per airline.
colors_airlines = ['skyblue', 'red', 'orange', 'pink']

# DataFrame.plot opens its own figure unless it is given an axes, so the axes is
# created explicitly; a bare plt.figure(figsize=...) would stay empty and the
# figure size would not be applied to the chart.
fig, ax = plt.subplots(figsize=(10, 6))
percentages_delay.plot(kind='bar', x='Airline', y='Percent_delay', color=colors_airlines, legend=False, rot=0, ax=ax)
ax.set_xlabel('Airline')
ax.set_ylabel('Percentage of delay and cancellation tweets (%)')
ax.set_title('Percentage of tweets about delay and cancellations per airline', weight='bold')
plt.show()

##### Money

In [None]:
# Money share per airline, looked up by topic label. The percentage Series are
# indexed by topic name, and integer keys such as [3] on a label-indexed Series
# are deprecated positional lookups in pandas.
per_money_ba = percentages_ba['money']
per_money_af = percentages_af['money']
per_money_klm = percentages_klm['money']
per_money_lh = percentages_lh['money']

percentages_money = pd.DataFrame({
    'Airline': ['British Airways', 'AirFrance', 'KLM', 'Lufthansa'],
    'Percent_money': [per_money_ba, per_money_af, per_money_klm, per_money_lh]
})

# One bar colour per airline.
colors_airlines = ['skyblue', 'red', 'orange', 'pink']

# DataFrame.plot opens its own figure unless it is given an axes, so the axes is
# created explicitly; a bare plt.figure(figsize=...) would stay empty and the
# figure size would not be applied to the chart.
fig, ax = plt.subplots(figsize=(10, 6))
percentages_money.plot(kind='bar', x='Airline', y='Percent_money', color=colors_airlines, legend=False, rot=0, ax=ax)
ax.set_xlabel('Airline')
ax.set_ylabel('Percentage of money tweets (%)')
ax.set_title('Percentage of tweets about money per airline', weight='bold')
plt.show()

### Stacked bar chart topics and airlines

In [None]:
# Share (in %) of every topic within each airline's total topic count,
# recomputed here so that this cell only depends on the counts.
percentages_ba = round((counts_ba / counts_ba_sum) * 100, 2)
percentages_af = round((counts_af / counts_af_sum) * 100, 2)
percentages_klm = round((counts_klm / counts_klm_sum) * 100, 2)
percentages_lh = round((counts_lh / counts_lh_sum) * 100, 2)

# Same order as the 'Airline' column below.
airline_percentages = [percentages_ba, percentages_af, percentages_klm, percentages_lh]

# Values are looked up by topic label: integer keys such as [0] on these
# label-indexed Series are deprecated positional lookups in pandas.
percentages = pd.DataFrame({
    'Airline': ['British Airways', 'AirFrance', 'KLM', 'Lufthansa'],
    'Staff': [share['staff'] for share in airline_percentages],
    'Baggage': [share['baggage'] for share in airline_percentages],
    'Delay and Cancellation': [share['delay_and_cancellation'] for share in airline_percentages],
    'Money': [share['money'] for share in airline_percentages]
})

# One stacked bar per airline; DataFrame.plot creates the figure itself and
# returns the axes that the labels below are attached to.
ax = percentages.set_index('Airline').plot(
    kind='bar', stacked=True, figsize=(10, 6), rot=0, color=sns.color_palette('colorblind')
)
ax.set_xlabel('Airline')
ax.set_ylabel('Percentage of Tweets')
ax.set_title('Percentage of Tweets per Topic by Airline', weight='bold')
# Legend placed outside the plotting area, to the right of the bars.
ax.legend(title='Topics', bbox_to_anchor=(1.05, 1), loc='upper left')
plt.tight_layout()

plt.show()

# Number of cooccuring topics

### Number of tweets that contain two topics

In [None]:
# Topic flags in a fixed column order; every pattern below follows this order:
# [staff, baggage, delay_and_cancellation, money].
topic_flags = df_topics_time[['staff', 'baggage', 'delay_and_cancellation', 'money']]

# A tweet is counted for a pair only when exactly those two flags are 1 and the
# other two are 0. Summing the boolean row mask replaces `.count()[0]`, which
# relied on a deprecated positional integer lookup on a label-indexed Series.
count_bdc = topic_flags.eq([0, 1, 1, 0], axis='columns').all(axis=1).sum()
count_bm = topic_flags.eq([0, 1, 0, 1], axis='columns').all(axis=1).sum()
count_bs = topic_flags.eq([1, 1, 0, 0], axis='columns').all(axis=1).sum()
count_mdc = topic_flags.eq([0, 0, 1, 1], axis='columns').all(axis=1).sum()
count_ms = topic_flags.eq([1, 0, 0, 1], axis='columns').all(axis=1).sum()
count_sdc = topic_flags.eq([1, 0, 1, 0], axis='columns').all(axis=1).sum()

In [None]:
# One row per pair of topics with the number of tweets flagged for exactly that pair.
counts_two_topics = pd.DataFrame({
    'Topics': ['baggage - delay and cancellation','baggage - money', 'baggage - staff', 'money - delay and cancellation', 'money - staff', 'staff - delay and cancellation'],
    'Counts': [count_bdc, count_bm, count_bs, count_mdc, count_ms, count_sdc]
})

# DataFrame.plot opens its own figure unless it is given an axes, so the axes is
# created explicitly; a bare plt.figure(figsize=...) would stay empty and the
# wide figure size would not be applied to the chart.
# The tick labels keep the default rotation of the bar plot.
fig, ax = plt.subplots(figsize=(20, 6))
counts_two_topics.plot(kind='bar', x='Topics', y='Counts', color=sns.color_palette('colorblind'), legend=False, ax=ax)
ax.set_xlabel('Combination of topics')
ax.set_ylabel('Number of tweets')
ax.set_title('Number of tweets per pair of topics', weight='bold')
plt.show()


### Number of tweets about three topics

In [None]:
# Topic flags in a fixed column order; every pattern below follows this order:
# [staff, baggage, delay_and_cancellation, money].
topic_flags = df_topics_time[['staff', 'baggage', 'delay_and_cancellation', 'money']]

# A tweet is counted for a triplet only when exactly those three flags are 1 and
# the remaining one is 0. Summing the boolean row mask replaces `.count()[0]`,
# which relied on a deprecated positional integer lookup on a label-indexed Series.
count_bdcs = topic_flags.eq([1, 1, 1, 0], axis='columns').all(axis=1).sum()
count_bdcm = topic_flags.eq([0, 1, 1, 1], axis='columns').all(axis=1).sum()
count_bsm = topic_flags.eq([1, 1, 0, 1], axis='columns').all(axis=1).sum()
count_dcsm = topic_flags.eq([1, 0, 1, 1], axis='columns').all(axis=1).sum()

In [None]:
# One row per triplet of topics with the number of tweets flagged for exactly that triplet.
counts_three_topics = pd.DataFrame({
    'Topics': ['baggage - delay and cancellation - money','baggage - money - staff', 'baggage - staff - delay and cancellation', 'money - delay and cancellation - staff'],
    'Counts': [count_bdcm, count_bsm, count_bdcs, count_dcsm]
})

# DataFrame.plot opens its own figure unless it is given an axes, so the axes is
# created explicitly; a bare plt.figure(figsize=...) would stay empty and the
# wide figure size would not be applied to the chart.
# The tick labels keep the default rotation of the bar plot.
fig, ax = plt.subplots(figsize=(20, 6))
counts_three_topics.plot(kind='bar', x='Topics', y='Counts', color=sns.color_palette('colorblind'), legend=False, ax=ax)
ax.set_xlabel('Combination of topics')
ax.set_ylabel('Number of tweets')
ax.set_title('Number of tweets per triplet of topics', weight='bold')
plt.show()

### Number of tweets about all four topics

In [None]:
# Number of tweets for which all four topic flags are 1. Summing the boolean row
# mask replaces `.count()[0]`, which relied on a deprecated positional integer
# lookup on a label-indexed Series.
topic_flags = df_topics_time[['staff', 'baggage', 'delay_and_cancellation', 'money']]
count_all = topic_flags.eq(1).all(axis=1).sum()
print(count_all)