In [1]:
import math
import numpy as np
import pandas as pd
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

from datetime import datetime 
from collections import defaultdict
from scipy.stats import pearsonr

# Tweets data preparation

In [None]:
tweets = pd.read_csv('tweets.csv')

In [40]:
tweets['id'] = pd.to_numeric(tweets['id'], errors='coerce')
tweets['user_id'] = pd.to_numeric(tweets['user_id'], errors='coerce')
tweets['retweet_count'] = pd.to_numeric(tweets['retweet_count'], errors='coerce')
tweets['reply_count'] = pd.to_numeric(tweets['reply_count'], errors='coerce')
tweets['favorite_count'] = pd.to_numeric(tweets['favorite_count'], errors='coerce')
tweets['num_hashtags'] = pd.to_numeric(tweets['num_hashtags'], errors='coerce')
tweets['num_mentions'] = pd.to_numeric(tweets['num_mentions'], errors='coerce')
tweets['num_urls'] = pd.to_numeric(tweets['num_urls'], errors='coerce')
tweets['created_at'] = pd.to_datetime(tweets['created_at'], errors='coerce')

In [None]:
tweets['reply_count'].describe()

In [None]:
tweets['reply_count'].unique()

In [None]:
tweets[tweets['reply_count'] > tweets['reply_count'].quantile(.999)]['reply_count'].unique()

## Eliminazione dei valori negativi

In [None]:
tweets['retweet_count'] = tweets['retweet_count'].abs()
tweets['reply_count'] = tweets['reply_count'].abs()
tweets['favorite_count'] = tweets['favorite_count'].abs()
tweets['num_hashtags'] = tweets['num_hashtags'].abs()
tweets['num_mentions'] = tweets['num_mentions'].abs()
tweets['num_urls'] = tweets['num_urls'].abs()

## Eliminazione dei valori inf

In [None]:
tweets.replace(math.inf, math.nan, inplace=True)
tweets.replace('inf', math.nan, inplace=True)

## Manage duplicates:

### Dropping duplicates:

In [None]:
tweets.drop_duplicates(inplace=True)

### Find and replace duplicated 'id'

In [None]:
mask = tweets.duplicated('id', keep=False)
tweets.loc[mask, 'id'] = -1

## Replacing NaN values:

### Replace NaN values in 'id'

Replace nan values with a value (-1) that indicates that this information is missing

In [None]:
tweets.loc[tweets['id'].isna(), 'id'] = -1

### Replacing NaN values in 'num_hashtags', 'num_urls' and 'num_mentions' by infer from the tweet's text

In [None]:
tweets['infer_hashtags'] = tweets.text.str.count('#')
tweets['infer_mentions'] = tweets.text.str.count('@')
tweets['infer_urls'] = tweets.text.str.count('http://')

tweets['num_hashtags'] = tweets['num_hashtags'].fillna(tweets['infer_hashtags'])
tweets['num_mentions'] = tweets['num_mentions'].fillna(tweets['infer_mentions'])
tweets['num_urls'] = tweets['num_urls'].fillna(tweets['infer_urls'])

tweets.drop(columns=['infer_hashtags', 'infer_mentions', 'infer_urls'], inplace=True)

### Replacing NaN values with the median of the user

In [None]:
t = tweets
user_median = t.groupby('user_id', as_index=False).median()
user_median.rename(columns={
    'retweet_count' : 'retweet_median',
    'reply_count' : 'reply_median',
    'favorite_count' : 'favorite_median',
    'num_hashtags' : 'hashtags_median',
    'num_urls' : 'urls_median',
    'num_mentions' : 'mentions_median'
}, inplace=True)
user_median.drop(columns=['id'], inplace=True)
tweets = t.merge(user_median, on='user_id')

#sostituzione valori
tweets['retweet_count'] = tweets['retweet_count'].fillna(tweets['retweet_median'])
tweets['reply_count'] = tweets['reply_count'].fillna(tweets['reply_median'])
tweets['favorite_count'] = tweets['favorite_count'].fillna(tweets['favorite_median'])
tweets['num_hashtags'] = tweets['num_hashtags'].fillna(tweets['hashtags_median'])
tweets['num_mentions'] = tweets['num_mentions'].fillna(tweets['mentions_median'])
tweets['num_urls'] = tweets['num_urls'].fillna(tweets['urls_median'])

tweets.drop(columns=['retweet_median', 'reply_median', 'favorite_median', 'hashtags_median', 'mentions_median', 'urls_median'], \
            inplace=True)


### Fill the remained NaN values with the median of the attribute

In [None]:
tweets['retweet_count'] = tweets['retweet_count'].fillna(tweets['retweet_count'].median())
tweets['reply_count'] = tweets['reply_count'].fillna(tweets['reply_count'].median())
tweets['favorite_count'] = tweets['favorite_count'].fillna(tweets['favorite_count'].median())
tweets['num_hashtags'] = tweets['num_hashtags'].fillna(tweets['num_hashtags'].median())
tweets['num_mentions'] = tweets['num_mentions'].fillna(tweets['num_mentions'].median())
tweets['num_urls'] = tweets['num_urls'].fillna(tweets['num_urls'].median())

## Replacing invalid 'created_at' with symbolic date (01/01/2000)

In [None]:
tweets.loc[(tweets.created_at < datetime.strptime("2006-07-15 00:00:00", "%Y-%m-%d %H:%M:%S")) | \
    (tweets.created_at > datetime.now()),'created_at'] = datetime.strptime("2000-01-01 00:00:00", "%Y-%m-%d %H:%M:%S")

In [None]:
backup = tweets

In [None]:
tweets = backup

## Manage outliers

In [None]:
tweets.loc[tweets['retweet_count'] > tweets['retweet_count'].quantile(.95), 'retweet_count'] = tweets['retweet_count'].median()
tweets.loc[tweets['reply_count'] > tweets['reply_count'].quantile(.9999), 'reply_count'] = tweets['reply_count'].median()
tweets.loc[tweets['favorite_count'] > tweets['favorite_count'].quantile(.995), 'favorite_count'] = tweets['favorite_count'].median()
tweets.loc[tweets['num_hashtags'] > tweets['num_hashtags'].quantile(.999), 'num_hashtags'] = tweets['num_hashtags'].median()
tweets.loc[tweets['num_mentions'] > tweets['num_mentions'].quantile(.999), 'num_mentions'] = tweets['num_mentions'].median()
tweets.loc[tweets['num_urls'] > tweets['num_urls'].quantile(.9999), 'num_urls'] = tweets['num_urls'].median()


In [None]:
fig = plt.figure(figsize=(20, 23))
fig_dims = (3, 2)
fig.subplots_adjust(hspace=0.4, wspace=0.4)

######
plt.subplot2grid(fig_dims, (0,0))
tweets.boxplot(column=['favorite_count'])

######
plt.subplot2grid(fig_dims, (0,1))
tweets.boxplot(column=['retweet_count'])

######
plt.subplot2grid(fig_dims, (1,0))
tweets.boxplot(column=['reply_count'])

######
plt.subplot2grid(fig_dims, (1,1))
tweets.boxplot(column=['num_urls'])

######
plt.subplot2grid(fig_dims, (2,0))
tweets.boxplot(column=['num_hashtags'])

######
plt.subplot2grid(fig_dims, (2,1))
tweets.boxplot(column=['num_mentions'])

## Tweets indicators

### Number of special character:

In [None]:
special = '[(|!£$%&/=?^@#§,.;:-_<>ç@)]' # Define special characters
tweets['special'] = tweets['text'].str.count(special) # Count them
tweets['special'] = tweets['special'].fillna(0)
tweets['special'] = tweets['special'].astype(int)

### Text lenght:

In [None]:
tweets['text_lenght'] = tweets['text'].str.len()
tweets['text_lenght'] = tweets['text_lenght'].fillna(0)
tweets['text_lenght'] = tweets['text_lenght'].astype(int)

In [None]:
tweets['reply_count'].describe()

## Write a new file '.csv' with all the modify done

In [None]:
tweets.to_csv('new_tweets.csv')

# User data Preparation

This part use the previous modify done to the tweets dataframe.

In [22]:
users = pd.read_csv('users.csv')
tweets = pd.read_csv('new_tweets.csv', lineterminator='\n', index_col=0)

In [23]:
users.info()
users.dtypes

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11508 entries, 0 to 11507
Data columns (total 6 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              11508 non-null  int64  
 1   name            11507 non-null  object 
 2   lang            11508 non-null  object 
 3   bot             11508 non-null  int64  
 4   created_at      11508 non-null  object 
 5   statuses_count  11109 non-null  float64
dtypes: float64(1), int64(2), object(3)
memory usage: 539.6+ KB


id                  int64
name               object
lang               object
bot                 int64
created_at         object
statuses_count    float64
dtype: object

## Column type conversion

In [24]:
users['id'] = pd.to_numeric(users['id'], errors='coerce')
users['statuses_count'] = pd.to_numeric(users['statuses_count'], errors='coerce')
users['created_at'] = pd.to_datetime(users['created_at'], errors='coerce')

## Elimination of negative values

In [25]:

users['statuses_count'] = users['statuses_count'].abs()

## Lang Correction

In [26]:
users['lang'] = users['lang'].str.lower()
users.rename(columns={
    'id' : 'user_id',
    'created_at' : 'subscribing_date'
}, inplace=True)
nal = users[(users['lang'] == 'select language...') | (users['lang'] == 'xx-lc')]

text_nal = tweets.merge(nal, on='user_id')
text_nal



Unnamed: 0,id,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text,special,text_lenght,name,lang,bot,subscribing_date,statuses_count
0,4.912745e+17,2.955215e+07,0.0,0.0,0.0,0.0,0.0,0.0,2019-07-23 17:32:30,"Spent the majority of my day drawing around, c...",10,111,Chloe,xx-lc,0,2014-04-09 21:36:02,15398.0
1,4.886930e+17,2.955215e+07,0.0,0.0,0.0,0.0,0.0,0.0,2019-07-16 14:34:25,We all love dem cuddles. http://t.co/fPrcnUfacS,10,47,Chloe,xx-lc,0,2014-04-09 21:36:02,15398.0
2,4.854109e+17,2.955215e+07,0.0,0.0,0.0,0.0,0.0,0.0,2019-07-07 13:12:33,Dog sitting until 8 tonight. Again..,5,36,Chloe,xx-lc,0,2014-04-09 21:36:02,15398.0
3,1.683542e+15,2.955215e+07,0.0,0.0,0.0,0.0,0.0,0.0,2019-07-16 20:58:50,,0,0,Chloe,xx-lc,0,2014-04-09 21:36:02,15398.0
4,7.055459e+09,2.955215e+07,0.0,0.0,0.0,0.0,0.0,0.0,2020-02-27 20:17:18,The chocolate is soooooooooooo amazing.,2,39,Chloe,xx-lc,0,2014-04-09 21:36:02,15398.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2191,5.910286e+17,2.956614e+09,61.0,0.0,0.0,0.0,1.0,1.0,2020-04-23 23:59:43,RT @LivvyAllison: If you are reading the front...,20,120,Leanne Arker,select language...,0,2020-01-05 00:09:59,57.0
2192,4.789901e+10,2.956614e+09,3.0,0.0,0.0,2.0,0.0,2.0,2020-03-19 10:54:49,RT @sophieraworth: Last night over Sweden... R...,30,137,Leanne Arker,select language...,0,2020-01-05 00:09:59,57.0
2193,5.939970e+17,2.956614e+09,0.0,0.0,0.0,0.0,1.0,0.0,2020-05-02 04:35:05,I think even super heroes should rest at weeke...,12,75,Leanne Arker,select language...,0,2020-01-05 00:09:59,57.0
2194,7.268000e+16,2.956614e+09,3.0,0.0,1.0,0.0,0.0,3.0,2020-05-01 19:38:48,@annabf @CampbellLiveNZ @UniteUnion well done,9,45,Leanne Arker,select language...,0,2020-01-05 00:09:59,57.0


In [27]:
text_nal[text_nal['name'] == 'Leanne Arker'].head(3)

Unnamed: 0,id,user_id,retweet_count,reply_count,favorite_count,num_hashtags,num_urls,num_mentions,created_at,text,special,text_lenght,name,lang,bot,subscribing_date,statuses_count
2130,5.86697e+17,2956614000.0,0.0,0.0,0.0,0.0,0.0,1.0,2020-04-12 01:07:12,@bernardchickey I've heard of something called...,7,137,Leanne Arker,select language...,0,2020-01-05 00:09:59,57.0
2131,5.601363e+17,2956614000.0,0.0,0.0,0.0,0.0,0.0,2.0,2020-01-29 18:04:45,RT @stephenfry: If physical diseases were trea...,16,110,Leanne Arker,select language...,0,2020-01-05 00:09:59,57.0
2132,5.751565e+17,2956614000.0,0.0,0.0,0.0,0.0,0.0,1.0,2020-03-11 04:49:27,@Hilary_Barry Gives new meaning to the term sl...,5,62,Leanne Arker,select language...,0,2020-01-05 00:09:59,57.0


##### Users without a language are en

In [28]:
users['lang'].replace('select language...', 'en', inplace=True)
users['lang'].replace('xx-lc', 'en', inplace=True)

## Calculate for each user how many tweets we have in tweets.csv

In [29]:

users = users.merge(tweets[['user_id', 'id']].groupby('user_id', as_index=False).count(), on='user_id')
users = users.rename(columns={'id' : 'count'})
users.head(3)



Unnamed: 0,user_id,name,lang,bot,subscribing_date,statuses_count,count
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76.0,132
1,2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54.0,122
2,137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,3.0,4


## Summing parameters in users (likes received, retweet and reply received)

In [30]:
new_tweets = tweets.filter(['user_id', 'retweet_count', 'reply_count', 'favorite_count', 'num_hashtags', 'num_urls', 'num_mentions'], axis=1)
new_users = users.merge(new_tweets.groupby('user_id').sum(), on='user_id')
new_users = new_users.rename(columns={
    'retweet_count' : 'retweet_received',
    'reply_count' : 'reply_received',
    'favorite_count' : 'favorite_received',
    'num_hashtags' : 'hashtag_used',
    'num_urls' : 'urls_used',
    'num_mentions' : 'mentions_used'}, 
    errors='raise'
)
new_users.head()



Unnamed: 0,user_id,name,lang,bot,subscribing_date,statuses_count,count,retweet_received,reply_received,favorite_received,hashtag_used,urls_used,mentions_used
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76.0,132,5.0,0.0,5.0,13.0,0.0,37.0
1,2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54.0,122,3.0,0.0,6.0,4.0,0.0,41.0
2,137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,3.0,4,0.0,0.0,0.0,0.0,0.0,0.0
3,466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50.0,1439,1190.0,0.0,248.0,92.0,36.0,9.0
4,2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085.0,3656,62.0,0.0,226.0,475.0,3.0,1913.0


In [31]:
new_users[\
    (new_users['retweet_received'].isna() == True) |\
    (new_users['reply_received'].isna() == True) |\
    (new_users['favorite_received'].isna() == True) |\
    (new_users['hashtag_used'].isna() == True) |\
    (new_users['urls_used'].isna() == True) |\
    (new_users['mentions_used'].isna() == True)
        ].head()

Unnamed: 0,user_id,name,lang,bot,subscribing_date,statuses_count,count,retweet_received,reply_received,favorite_received,hashtag_used,urls_used,mentions_used


In [32]:
new_users['reply_received'] = new_users['reply_received'].fillna(new_users['reply_received'].median())
new_users['retweet_received'] = new_users['retweet_received'].fillna(new_users['retweet_received'].median())
new_users['statuses_count'] = new_users['statuses_count'].fillna(new_users['statuses_count'].median())

In [33]:
new_users['reply_received'] = new_users['reply_received'].astype(int)
new_users['retweet_received'] = new_users['retweet_received'].astype(int)
new_users['favorite_received'] = new_users['favorite_received'].astype(int)
new_users['hashtag_used'] = new_users['hashtag_used'].astype(int)
new_users['urls_used'] = new_users['urls_used'].astype(int)
new_users['mentions_used'] = new_users['mentions_used'].astype(int)
new_users['statuses_count'] = new_users['statuses_count'].astype(int)

## Ratio

In [34]:
new_users['favorite_avg'] = round(new_users['favorite_received'] / new_users['count'], 2)
new_users.head(3)

Unnamed: 0,user_id,name,lang,bot,subscribing_date,statuses_count,count,retweet_received,reply_received,favorite_received,hashtag_used,urls_used,mentions_used,favorite_avg
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,132,5,0,5,13,0,37,0.04
1,2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,122,3,0,6,4,0,41,0.05
2,137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,3,4,0,0,0,0,0,0,0.0


In [35]:
new_users['retweet_avg'] = round(new_users['retweet_received'] / new_users['count'], 2)
new_users['reply_avg'] = round(new_users['reply_received'] / new_users['count'], 2)
new_users['urls_avg'] = round(new_users['urls_used'] / new_users['count'], 2)
new_users['mentions_avg'] = round(new_users['mentions_used'] / new_users['count'], 2)
new_users['hashtags_avg'] = round(new_users['hashtag_used'] / new_users['count'], 2)

## Avg text lenght per user

In [36]:
mean_tweets = tweets[['user_id', 'text_lenght']].groupby('user_id', as_index=False).mean()

In [37]:
mean_tweets['user_id'] = mean_tweets['user_id'].astype(int)
mean_tweets['text_lenght'] = round(mean_tweets['text_lenght'])
mean_tweets['text_lenght'] = mean_tweets['text_lenght'].astype(int)
mean_tweets = mean_tweets.rename(columns={'text_lenght':'avg_lenght'})

In [38]:
new_users = new_users.merge(mean_tweets, on='user_id')
new_users.head(2)

Unnamed: 0,user_id,name,lang,bot,subscribing_date,statuses_count,count,retweet_received,reply_received,favorite_received,hashtag_used,urls_used,mentions_used,favorite_avg,retweet_avg,reply_avg,urls_avg,mentions_avg,hashtags_avg,avg_lenght
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,132,5,0,5,13,0,37,0.04,0.04,0.0,0.0,0.28,0.1,62
1,2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,122,3,0,6,4,0,41,0.05,0.02,0.0,0.0,0.34,0.03,69


In [41]:
date_tweets = tweets
date_tweets['year'] = date_tweets['created_at'].dt.year
date_tweets['month'] = date_tweets['created_at'].dt.month
date_tweets['day'] = date_tweets['created_at'].dt.day
date_tweets['hour'] = date_tweets['created_at'].dt.hour

In [42]:
date_years = date_tweets.groupby(by=['user_id', 'year', 'month']).count()

In [43]:
date_years['count'] = date_years['id']
date_years = date_years[['count']]
date_years.head(5)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,count
user_id,year,month,Unnamed: 3_level_1
0.0,2019,3,1
0.0,2019,9,1
0.0,2019,10,1
0.0,2020,2,1
0.0,2020,3,3


In [44]:
minimum = date_years.reset_index().groupby('user_id').min()
maximum = date_years.reset_index().groupby('user_id').max()
minimum = minimum[['year', 'month']].rename(columns={'year' : 'year_min', 'month' : 'month_min'})
maximum = maximum[['year', 'month']].rename(columns={'year' : 'year_max', 'month' : 'month_max'})

activity_period = minimum.merge(maximum, on='user_id')
activity_period['activity_period'] = (activity_period['year_max'] - activity_period['year_min'])*12 + activity_period['month_max'] - activity_period['month_min']
activity_period = activity_period.drop(columns=['year_min', 'year_max', 'month_min', 'month_max'])

In [45]:
activity_period.head()

Unnamed: 0_level_0,year_min,month_min,year_max,month_max,activity_period
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,2019,2,2020,10,20
2.0,2020,1,2020,4,3
3.0,2014,1,2019,6,65
5.0,2019,1,2020,6,17
6.0,2019,10,2019,10,0


In [46]:
activity_period = activity_period.reset_index()
new_users = new_users.merge(activity_period, on='user_id')
new_users.head()



Unnamed: 0,user_id,name,lang,bot,subscribing_date,statuses_count,count,retweet_received,reply_received,favorite_received,...,reply_avg,urls_avg,mentions_avg,hashtags_avg,avg_lenght,year_min,month_min,year_max,month_max,activity_period
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,132,5,0,5,...,0.0,0.0,0.28,0.1,62,2019,3,2019,8,5
1,2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,122,3,0,6,...,0.0,0.0,0.34,0.03,69,2019,3,2019,8,5
2,137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,3,4,0,0,0,...,0.0,0.0,0.0,0.0,19,2016,6,2016,6,0
3,466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,1439,1190,0,248,...,0.0,0.03,0.01,0.06,87,2017,5,2019,11,30
4,2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085,3656,62,0,226,...,0.0,0.0,0.52,0.13,72,2020,1,2020,5,4


In [47]:
#activity_period = activity_period.reset_index()
date_years[date_years['user_id'] == 3]

KeyError: 'user_id'

In [48]:
g_sum = new_users['bot'].sum()
values = new_users['bot']/g_sum
new_users['Bot_Entropy'] = -(values*np.log(values))
users['Bot_Entropy'] = new_users['Bot_Entropy'].fillna(0)
users['Bot_Entropy'].sum()


  result = getattr(ufunc, method)(*inputs, **kwargs)


8.718663567048953

In [49]:
new_users.head(50)

#df1 = users.groupby('id',as_index=False,sort=False)['Count_Entropy'].sum()

Unnamed: 0,user_id,name,lang,bot,subscribing_date,statuses_count,count,retweet_received,reply_received,favorite_received,...,urls_avg,mentions_avg,hashtags_avg,avg_lenght,year_min,month_min,year_max,month_max,activity_period,Bot_Entropy
0,2353593986,Lamonica Raborn,en,1,2019-02-22 18:00:42,76,132,5,0,5,...,0.0,0.28,0.1,62,2019,3,2019,8,5,0.001426
1,2358850842,Lourie Botton,en,0,2019-02-26 03:02:32,54,122,3,0,6,...,0.0,0.34,0.03,69,2019,3,2019,8,5,
2,137959629,Dadan Syarifudin,en,1,2015-04-30 07:09:56,3,4,0,0,0,...,0.0,0.0,0.0,19,2016,6,2016,6,0,0.001426
3,466124818,Carletto Focia,it,1,2017-01-18 02:49:18,50,1439,1190,0,248,...,0.03,0.01,0.06,87,2017,5,2019,11,30,0.001426
4,2571493866,MBK Ebook,en,0,2019-06-18 19:30:21,7085,3656,62,0,226,...,0.0,0.52,0.13,72,2020,1,2020,5,4,
5,2199062688,Qq,en,0,2018-11-19 06:58:18,7406,2081,31810,0,1009,...,0.03,0.91,0.02,45,2019,1,2020,8,19,
6,1174869769,Tania Wren,en,1,2018-02-15 10:51:28,21,24,0,0,0,...,0.08,0.12,0.25,64,2018,2,2018,4,2,0.001426
7,3118659848,charlotte gray,en,0,2020-03-31 21:19:59,115,135,595,0,106,...,0.1,0.78,0.12,62,2020,4,2020,5,1,
8,616225564,Anisha Williams,en,0,2017-06-25 15:49:36,68,60,1,0,3,...,0.03,0.02,0.23,67,2017,1,2018,12,23,
9,2357425536,Meda Tatlock,en,1,2019-02-25 04:30:56,69,150,6,0,5,...,0.0,0.36,0.12,64,2019,3,2019,8,5,0.001426


In [50]:
users = new_users

## Write new file csv

In [51]:
users.to_csv('new_users.csv')