In [None]:
import os
import glob
import pandas as pd
import numpy as np
import filenames

import plotly.express as px
from jupyter_dash import JupyterDash
from dash import dcc
from dash import html
from dash.dependencies import Input, Output
import plotly.graph_objects as go

# Raise pandas' display limits so wide/long frames are not truncated in cell output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

### Data

Use `glob` to collect all the CSV files in the raw data folder.

In [None]:
# Collect every CSV in the profile folder. The paths are sorted because glob
# yields them in an arbitrary, filesystem-dependent order, and the row order of
# the concatenated frame decides which row survives a later `keep='last'`
# de-duplication.
# NOTE(review): assumes filenames.profile_folder_path is a pathlib.Path (or
# exposes a compatible .glob returning orderable paths) -- confirm.
profile_files = sorted(filenames.profile_folder_path.glob("*.csv"))

# Fail with an actionable message instead of pandas' generic
# "No objects to concatenate" when the folder holds no CSV files.
if not profile_files:
    raise FileNotFoundError(
        f"No CSV files found in {filenames.profile_folder_path}")

# Read each file into its own frame, then stack them once.
profile_appended_data = [pd.read_csv(path) for path in profile_files]

# ignore_index=True renumbers the rows 0..n-1 in the same step, replacing the
# separate in-place reset_index call.
df = pd.concat(profile_appended_data, ignore_index=True)

#### Drop duplicate userid

In [None]:
# Keep only the last row seen for each userid and renumber the rows 0..n-1.
df = df.drop_duplicates(subset=['userid'], keep='last', ignore_index=True)

#### Create Label for Followers

In [None]:
import csv
fpath = filenames.followers_path
# Build the follower list from the first column of every row in the CSV.
with open(fpath, newline='') as f:
    # csv.reader yields an empty list for blank lines; skipping those rows keeps
    # row[0] from raising IndexError (e.g. on a trailing blank line).
    follower = [row[0] for row in csv.reader(f) if row]

In [None]:
# 1 when the profile's username appears in the follower list, 0 otherwise.
follower_mask = df['username'].isin(follower)
df['is_follower'] = follower_mask.astype(int)

#### Detect Language of biography

In [None]:
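# pycld2 is needed for the language detection below. To install it from this
# pinned commit, uncomment the next line:
#! pip install https://github.com/aboSamoor/pycld2/zipball/e3ac86ed4d4902e912691c1531d0c5645382a726
# Background on language-detection libraries:
# https://towardsdatascience.com/4-python-libraries-to-detect-english-and-non-english-language-c82ad3efd430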

In [None]:
import pycld2 as cld2
def detect_language(row):
    """Run cld2 language detection on ``row`` and return only its vectors.

    Args:
        row: The value handed to ``cld2.detect``; in this notebook it is a
            single biography value.

    Returns:
        The fourth element of the tuple that ``cld2.detect`` returns when
        called with ``returnVectors=True``. The other three elements
        (reliability flag, bytes found, details) are discarded.
    """
    detection = cld2.detect(row, returnVectors=True)
    # Only the last of the four results is needed by the caller.
    _is_reliable, _bytes_found, _details, vectors = detection
    return vectors

In [None]:
# Run language detection on every non-null biography; rows without one get NaN.
# Iterating over the values directly avoids the per-row label lookup
# df['biography'][i], which is slow and only lines up when the index is exactly
# 0..n-1. np.nan replaces the np.NaN alias, which NumPy 2.0 removed.
df["language_detect"] = [
    detect_language(bio) if pd.notnull(bio) else np.nan
    for bio in df['biography']
]

In [None]:
def _vector_languages(vectors):
    """Return element 2 of every detection vector, passing missing values through.

    NOTE(review): element 2 is presumably the language name reported by cld2 --
    confirm against the pycld2 vector layout.
    """
    # Missing detections (no biography) are returned unchanged.
    if pd.isna(vectors):
        return vectors
    return [vector[2] for vector in vectors]


df['language'] = df['language_detect'].apply(_vector_languages)

In [None]:
# Turn each entry into its string form, then trim '[' and ']' (in that order)
# from both ends so only the comma-separated contents remain.
language_text = df['language'].map(str)
df['language'] = language_text.str.strip('[').str.strip(']')

In [None]:
# Take the first entry of each comma-separated language string, then count how
# many profiles share each value.
first_language = df['language'].str.split(', ', expand=True)[0]
language_df = (
    first_language
    .value_counts()
    .rename_axis('Language')
    .reset_index(name='Count')
)

In [None]:
# Bar chart of the number of profiles per first detected biography language.
fig = go.Figure()
fig.add_trace(
    go.Bar(
        x=language_df['Language'],
        y=language_df['Count'],
        # The count is printed above each bar; the y-axis tick labels are
        # hidden in the layout below, so this text is the only value readout.
        text=language_df['Count'],
        texttemplate='%{text:}',
        textposition='outside',
        marker_color='lightslategrey',
        # The trace name appears in the unified hover label. It was previously
        # 'revenue', which mislabeled these profile counts.
        name='Profiles',
    )
)

# A single layout call; update_layout returns the figure, so as the last
# expression of the cell it also renders the chart.
fig.update_layout(
    title_text='Profiles by detected biography language',
    # Hide bar labels that would have to shrink below 8pt to fit.
    uniformtext_minsize=8,
    uniformtext_mode='hide',
    xaxis=dict(
        showline=True,
        showgrid=False,
        showticklabels=True,
        linecolor='rgb(204, 204, 204)',
        linewidth=2,
        ticks='outside',
        tickfont=dict(
            family='Arial',
            size=12,
            color='rgb(82, 82, 82)',
        ),
    ),
    yaxis=dict(
        showgrid=False,
        zeroline=False,
        showline=False,
        showticklabels=False,
        showspikes=False,
    ),
    autosize=True,
    margin=dict(
        autoexpand=False,
        l=100,
        r=20,
        t=110,
    ),
    font_family="Courier New",
    title_font_family="Times New Roman",
    hovermode="x unified",
    legend=dict(
        yanchor="top",
        y=0.99,
        xanchor="left",
        x=0.01,
        font=dict(
            family="Arial",
            size=12,
            color="black",
        ),
    ),
    plot_bgcolor='white',
)

In [None]:
# Display the column labels currently present on df.
df.columns

In [None]:
import plotly.figure_factory as ff

In [None]:
# Select each metric's values at or below a fixed cap (1000 posts, 5000
# followers, 5000 followees); rows above the cap are left out of that series.
x1 = df.loc[df['mediacount'].le(1000), 'mediacount'].values
x2 = df.loc[df['followers'].le(5000), 'followers'].values
x3 = df.loc[df['followees'].le(5000), 'followees'].values

# One label per series, in the same order as the data.
group_labels = ['MediaCount', 'Followers', 'Followees']
hist_data = [x1, x2, x3]

# Distribution plot with a separate bin width for each series.
fig = ff.create_distplot(hist_data, group_labels, bin_size=[50, 10, 10])
fig.show()

The number of posts is not an indication of the number of followers or followees a profile has.
Many factors can have an impact, e.g., how many years the profile has been active, whether the owner deletes posts over time, and the popularity of the person.