First, let's import all necessary modules

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings("ignore")

# Read the data

In [4]:
# Load the tweets dataset from data.csv (path is relative to the notebook's working directory).
df = pd.read_csv("data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,date,text,favorites,retweets,is_retweet,category,user_location_updated,user-country_code,user-country
0,0,AARPKY,Bluegrass State,Official AARP Kentucky Twitter. Get to know us...,2009-07-25 15:14:41,3266,2021-02-18 14:12:52,@3rdRetiredTeach See Kentucky's update vaccine...,0,0,False,Positive,Bluegrass State,USA,United States
1,1,GoSolar01,,"Get all your solar info here, including latest...",2017-12-14 06:26:39,1861,2021-02-18 05:35:56,RT @dr_hhq: Chipped - #crushcovid #gettheshot ...,0,1,True,Positive,,,
2,2,dr_hhq,"Karachi, Pakistan",Assistant Professor #Urology #SIUT - Half Prof...,2009-08-15 06:11:55,2378,2021-02-18 05:35:37,Chipped - #crushcovid #gettheshot 🇨🇳#covid #co...,2,1,True,Positive,"Karachi, Pakistan",PAK,Pakistan
3,3,Nakhasi_MD,"Los Angeles, CA",Doc in Compton | Policy Adviser. LA County Dep...,2009-03-11 06:04:06,2205,2021-02-18 03:58:45,"RT @PinnacleTC_Hope: Holly Broce, president of...",0,1,True,Positive,"Los Angeles, CA",USA,United States
4,4,SonLaurencio,"Kentucky, USA",Pro Executive Entrepreneur #business #marketin...,2013-04-15 02:31:40,1558,2021-02-18 03:48:33,RT @LFCHD: A COVID-19 vaccine update thread:\n...,0,1,True,Positive,"Kentucky, USA",USA,United States


In [3]:
# Remove the two unnamed index-like columns that came in with the CSV.
# Reassigning with errors='ignore' (instead of inplace=True) keeps this cell idempotent:
# running it a second time no longer raises KeyError for columns that are already gone.
df = df.drop(columns=['Unnamed: 0', 'Unnamed: 0.1'], errors='ignore')

In [4]:
# (number of rows, number of columns) of the frame.
df.shape

(36696, 14)

There are 14 columns and 36696 rows (we dropped the 2 unnamed index columns because they are irrelevant).

# Present summary statistics

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) of the numeric columns.
df.describe()

Unnamed: 0,user_followers,favorites,retweets
count,36696.0,36696.0,36696.0
mean,6813.57,2.785971,85.690429
std,94500.24,22.43639,720.188548
min,0.0,0.0,0.0
25%,149.0,0.0,0.0
50%,610.0,0.0,3.0
75%,2238.0,0.0,16.0
max,8623795.0,1318.0,25838.0


# Present datatype per column

In [6]:
# Column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 36696 entries, 0 to 36695
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   user_name              36696 non-null  object
 1   user_location          26430 non-null  object
 2   user_description       32793 non-null  object
 3   user_created           36696 non-null  object
 4   user_followers         36696 non-null  int64 
 5   date                   36696 non-null  object
 6   text                   36696 non-null  object
 7   favorites              36696 non-null  int64 
 8   retweets               36696 non-null  int64 
 9   is_retweet             36696 non-null  bool  
 10  category               36696 non-null  object
 11  user_location_updated  26235 non-null  object
 12  user-country_code      15116 non-null  object
 13  user-country           15116 non-null  object
dtypes: bool(1), int64(3), object(10)
memory usage: 3.7+ MB


# Present columns with NA

In [7]:
# Names of all columns that contain at least one missing value.
column_with_nulls = np.array(df.columns[df.isna().any(axis=0)])
column_with_nulls

array(['user_location', 'user_description', 'user_location_updated',
       'user-country_code', 'user-country'], dtype=object)

### Perform data integrity checks

Tweets length is more than 0

In [8]:
# Every tweet must be non-empty. Compare each length with 0 first and only then reduce
# with .all(); the previous form reduced to a single bool and compared that bool with 0.
assert (df['text'].str.len() > 0).all(), "found tweets with empty text"

Only 2 categories: Negative and Positive

In [9]:
# Number of tweets per label.
df['category'].value_counts()

Negative    19252
Positive    17444
Name: category, dtype: int64

# Define the variables

In [5]:
# Name of the label column we want to predict.
target = 'category'
# Show the available columns.
df.columns

Index(['Unnamed: 0', 'user_name', 'user_location', 'user_description',
       'user_created', 'user_followers', 'date', 'text', 'favorites',
       'retweets', 'is_retweet', 'category', 'user_location_updated',
       'user-country_code', 'user-country'],
      dtype='object')

**Categorical:**\
user_name\
user_location\
user_description\
user_created\
date\
text\
is_retweet

**Numerical:**\
user_followers\
favorites\
retweets


# Dates

In [6]:
# Parse the account-creation timestamps; errors='coerce' turns unparseable values into NaT
# instead of raising.
df['user_created'] = pd.to_datetime(df['user_created'], errors = 'coerce')
# Let's check the type of each distinct value
# (a single Timestamp entry in the tally means every value was parsed).
df['user_created'].apply(type).value_counts()

<class 'pandas._libs.tslibs.timestamps.Timestamp'>    36696
Name: user_created, dtype: int64

**GOOD!**

In [7]:
# Parse the tweet timestamps; errors='coerce' turns unparseable values into NaT
# instead of raising.
df['date'] = pd.to_datetime(df['date'], errors = 'coerce')
# Let's check the type of each distinct value
# (a single Timestamp entry in the tally means every value was parsed).
df['date'].apply(type).value_counts()

<class 'pandas._libs.tslibs.timestamps.Timestamp'>    36696
Name: date, dtype: int64

**GOOD!**

# Splitting the data

## Prepare data

In [8]:
# Let's prepare our data :
features = [feature for feature in df.columns if feature != target]

relevant_df = df[features + [target]].drop_duplicates() # making sure to drop duplicated rows
relevant_df.shape # check the shape

(36696, 15)

In [9]:
# Feature matrix and label vector taken from the de-duplicated frame.
X, y = relevant_df[features], relevant_df[target]

#### Perform split

In [10]:
from sklearn.model_selection import train_test_split

# 80/20 split, stratified on the label so both parts keep the same class balance;
# the fixed random_state makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
    stratify=relevant_df[target],
)

#### Check the validity of our split

In [11]:
# Share of all rows that ended up in train and in test, respectively.
print(len(X_train) / len(relevant_df), len(X_test) / len(relevant_df))
# Index labels present in both parts; an empty set means train and test do not overlap.
print(set(X_train.index).intersection(set(X_test.index)))

0.7999781992587748 0.2000218007412252
set()


#### Prepare train and test datasets for the next steps

In [12]:
# Re-attach the labels to the training features so both can be transformed together.
train_df = pd.concat(objs=[X_train[features], y_train], axis="columns")

In [13]:
# Re-attach the labels to the test features so both can be transformed together.
test_df = pd.concat(objs=[X_test[features], y_test], axis="columns")

# Missing NA's

#### Missing values - train dataset

In [14]:
# Train columns that contain at least one missing value ...
column_with_nulls = np.array(train_df.columns[train_df.isna().any()])
# ... and how many values are missing in each of them.
dict(train_df[column_with_nulls].isna().sum())

{'user_location': 8213,
 'user_description': 3095,
 'user_location_updated': 8373,
 'user-country_code': 17224,
 'user-country': 17224}

In [15]:
# Training rows that have a missing value in at least one column.
train_df[train_df.isnull().any(axis=1)]

Unnamed: 0.1,Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,date,text,favorites,retweets,is_retweet,user_location_updated,user-country_code,user-country,category
33939,33939,TamiWami69,"Tennessee, USA","Designer, Writer, Artist, Skeptic, Lover & inf...",2010-04-22 21:04:40,347,2021-01-21 16:29:05,RT @NickHudsonCT: Raising conflicts of interes...,0,1758,True,"Tennessee, USA",,,Negative
10938,10938,Extranenne,Constellation du Cygne,,2014-04-10 11:05:21,417,2021-02-17 12:26:13,RT @docdoug45: #Covid19 #CovidHoax #Plandemic ...,0,1,True,Constellation du Cygne,,,Negative
16725,16725,SunshineTheGrey,"Nebraska, USA,",SHOW ME WHAT YOU GOT! THIS IS MY OPINION! \n...,2009-05-23 02:16:48,3476,2021-02-03 17:26:55,@WorldResources @RockyMtnInst What about #Fuku...,1,1,True,"Nebraska, USA,",,,Negative
28563,28563,calabrese_diane,Phila / NYC,"USCAN Modality Leader - Structural Heart, Inte...",2015-04-18 20:44:54,104,2021-01-20 02:54:28,#IGotTheShot @RowanUniversity Mega Vaccination...,2,1,True,Phila NYC,,,Positive
5222,5222,stevebell24,Scotland,,2012-03-26 17:45:45,559,2021-02-09 15:39:49,RT @indyNurseBrian: Had to continue on foot a ...,0,227,True,Scotland,GBR,United Kingdom,Positive
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20974,20974,FionaKi40696316,Everywhere and nowhere,Just listening.,2020-03-05 21:43:48,26,2021-02-08 22:21:26,RT @skepticalraptor: Were #COVID19 #vaccines r...,0,3,True,Everywhere and nowhere,,,Positive
1767,1767,sonodoc99,,"Dammit, Jim, I'm a Doctor, not a Provider. As ...",2009-10-08 13:14:30,2811,2021-02-17 18:35:06,@NYGovCuomo This is what happens when you do n...,3,1,True,,,,Positive
26218,26218,Martyk30111235,,,2020-01-27 14:03:38,186,2021-02-06 00:40:31,RT @CitizenIre: #Covidhoax\n#NewWorldOrder \n#...,0,10,True,,,,Negative
15389,15389,LSW12612672511,,🌸🌷🌺🐈🐩🐕🐶🐱🐯🦁🐼🦍🐒🐷🐧🍄🐝🦋🐵J.Woke - Anti-Zionist #OPEN...,2015-04-03 06:37:36,5288,2021-02-05 11:02:02,RT @darkagenda: Nana Akua and Dr Sarah Jarvis ...,0,6,True,,,,Negative


In [16]:
#train_df = train_df.drop(69) # delete row 69

KeyError: '[69] not found in axis'

There are too many NaNs to delete them.
I think that the best solution for the NaNs is to fill by 'other'.

In [17]:
# Replace missing profile text with the placeholder 'other'.
# The result is assigned back to the column: calling fillna(inplace=True) on an
# attribute-accessed column is chained assignment, which pandas does not guarantee
# to write through to train_df.
train_df['user_description'] = train_df['user_description'].fillna('other')
train_df['user_location'] = train_df['user_location'].fillna('other')

In [18]:
# Training rows that still have a missing value in some column after the fill.
train_df[train_df.isnull().any(axis=1)]

Unnamed: 0.1,Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,date,text,favorites,retweets,is_retweet,user_location_updated,user-country_code,user-country,category
33939,33939,TamiWami69,"Tennessee, USA","Designer, Writer, Artist, Skeptic, Lover & inf...",2010-04-22 21:04:40,347,2021-01-21 16:29:05,RT @NickHudsonCT: Raising conflicts of interes...,0,1758,True,"Tennessee, USA",,,Negative
10938,10938,Extranenne,Constellation du Cygne,other,2014-04-10 11:05:21,417,2021-02-17 12:26:13,RT @docdoug45: #Covid19 #CovidHoax #Plandemic ...,0,1,True,Constellation du Cygne,,,Negative
16725,16725,SunshineTheGrey,"Nebraska, USA,",SHOW ME WHAT YOU GOT! THIS IS MY OPINION! \n...,2009-05-23 02:16:48,3476,2021-02-03 17:26:55,@WorldResources @RockyMtnInst What about #Fuku...,1,1,True,"Nebraska, USA,",,,Negative
28563,28563,calabrese_diane,Phila / NYC,"USCAN Modality Leader - Structural Heart, Inte...",2015-04-18 20:44:54,104,2021-01-20 02:54:28,#IGotTheShot @RowanUniversity Mega Vaccination...,2,1,True,Phila NYC,,,Positive
26553,26553,FlytheStGeorge,other,“The truth is incontrovertible. Malice may att...,2017-05-23 14:10:49,929,2021-02-08 12:54:43,RT @NewtonClarkeUK: PREDICTION 1:\nThe vaccine...,0,104,True,,,,Negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20974,20974,FionaKi40696316,Everywhere and nowhere,Just listening.,2020-03-05 21:43:48,26,2021-02-08 22:21:26,RT @skepticalraptor: Were #COVID19 #vaccines r...,0,3,True,Everywhere and nowhere,,,Positive
1767,1767,sonodoc99,other,"Dammit, Jim, I'm a Doctor, not a Provider. As ...",2009-10-08 13:14:30,2811,2021-02-17 18:35:06,@NYGovCuomo This is what happens when you do n...,3,1,True,,,,Positive
26218,26218,Martyk30111235,other,other,2020-01-27 14:03:38,186,2021-02-06 00:40:31,RT @CitizenIre: #Covidhoax\n#NewWorldOrder \n#...,0,10,True,,,,Negative
15389,15389,LSW12612672511,other,🌸🌷🌺🐈🐩🐕🐶🐱🐯🦁🐼🦍🐒🐷🐧🍄🐝🦋🐵J.Woke - Anti-Zionist #OPEN...,2015-04-03 06:37:36,5288,2021-02-05 11:02:02,RT @darkagenda: Nana Akua and Dr Sarah Jarvis ...,0,6,True,,,,Negative


#### Missing values - test dataset

In [19]:
# Apply the same placeholder to the test set.
# The result is assigned back to the column: calling fillna(inplace=True) on an
# attribute-accessed column is chained assignment, which pandas does not guarantee
# to write through to test_df.
test_df['user_description'] = test_df['user_description'].fillna('other')
test_df['user_location'] = test_df['user_location'].fillna('other')

In [20]:
# Test rows that still have a missing value in some column after the fill.
test_df[test_df.isnull().any(axis=1)]

Unnamed: 0.1,Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,date,text,favorites,retweets,is_retweet,user_location_updated,user-country_code,user-country,category
19461,19461,KimYuMD,other,"Regional Medical Director, Aledade\nFamily Phy...",2021-01-29 05:26:33,172,2021-02-08 22:54:25,"RT @CKefalas: Hi, America: a reminder tonight’...",0,6,True,,,,Positive
33958,33958,dab7cdaf46b0463,other,"Black Country Wench, spent the last 20 years t...",2014-05-12 22:16:03,93,2021-01-21 16:07:01,RT @NickHudsonCT: Raising conflicts of interes...,0,1758,True,,,,Negative
16513,16513,Paulm27202322,other,other,2019-11-25 13:28:36,40,2021-02-08 15:17:43,RT @NewtonClarkeUK: PREDICTION 1:\nThe vaccine...,0,104,True,,,,Negative
3774,3774,knock_long,other,just bullshit,2016-02-05 23:58:31,64,2021-02-18 04:32:36,"RT @ochealth: #OC, please see the press releas...",0,15,True,,,,Positive
14297,14297,trustyourhearth,other,Singer/Artist/Designer/Actress. Whistleblower ...,2020-12-21 19:36:18,80,2021-02-05 13:30:52,"Manifest the ‘Great Reset’ to be Nesara, the ‘...",0,0,False,,,,Negative
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36243,36243,jimmykkk5,UK-North East of the Tyne,Those who would give up essential liberty to p...,2020-09-22 13:03:04,168,2021-01-21 23:39:02,@mariannaspring Please define what is Antivaxx ?,1,0,False,UKNorth East of the Tyne,,,Negative
15129,15129,smoknbeaver,"Central Illinois,United States",#smoknbeaver Fun T-shirts! Get them ALL at htt...,2017-05-09 13:01:38,2238,2021-02-06 12:42:35,https://t.co/nYuETtZgCy #smoknbeaver \nSmok’n’...,0,0,False,"Central Illinois,United States",,,Negative
16613,16613,B_man1973,North West England.,Serving cop on Lancashire's Tactical Operatio...,2018-07-13 08:12:48,1169,2021-02-07 22:33:29,RT @NewtonClarkeUK: PREDICTION 1:\nThe vaccine...,0,104,True,North West England,,,Negative
5037,5037,Jeremy13605986,other,other,2019-10-07 22:02:05,264,2021-02-04 00:56:04,RT @medcoe: #GetTheShot! Ssg. Josue Rivera-Sot...,0,3,True,,,,Positive


# I think the data is too sparse, so I won't delete more observations (outliers).

# Feature engineering

## Train

Assumption: people with extreme opinions write tweets of a different length (number of characters).

In [21]:
# Number of characters in each tweet.
train_df['tweet_len'] = train_df['text'].str.len()
train_df.head()

Unnamed: 0.1,Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,date,text,favorites,retweets,is_retweet,user_location_updated,user-country_code,user-country,category,tweet_len
18046,18046,Cagsil,USA,"Knowledge, Experience & Wisdom Is Shared. It's...",2010-02-25 02:10:51,6299,2021-02-07 16:13:14,RT @Cagsil: Now would be extremely helpful to ...,0,7,True,USA,USA,United States,Negative,140
33939,33939,TamiWami69,"Tennessee, USA","Designer, Writer, Artist, Skeptic, Lover & inf...",2010-04-22 21:04:40,347,2021-01-21 16:29:05,RT @NickHudsonCT: Raising conflicts of interes...,0,1758,True,"Tennessee, USA",,,Negative,144
17696,17696,colinsmithse5,camberwell,"Hairstylist, civilised to @RichardGse5 tall & ...",2011-02-03 19:16:40,413,2021-02-05 17:28:46,@MetroUK One for all the #antivaxers #antivax ...,0,0,False,camberwell,GBR,United Kingdom,Negative,59
22446,22446,CharlesDiComo,"Lancaster, PA","Healthcare Executive, WellSpan, Geneticist, Ph...",2009-02-25 16:59:53,401,2021-02-08 22:57:22,#healthystepahead #takingmyshot #getvaccinated...,0,0,False,"Lancaster, PA",USA,United States,Positive,144
21184,21184,PfineFine,"Durham, North Carolina","A believer in people. CEO, @fhi360. All tweets...",2013-04-18 00:00:33,1747,2021-02-08 17:49:19,"RT @fhi360: @PfineFine, @FHI360’s CEO, will be...",0,1,True,"Durham, North Carolina",USA,United States,Positive,140


Maybe people with extreme opinions would use more exclamation marks?

In [22]:
# Count the non-alphabetic characters of each tweet (punctuation, digits, whitespace, emoji ...).
# Applying over the single 'text' column avoids building a full row object per tweet,
# which is what DataFrame.apply(axis=1) did.
train_df["tweet_special chars"] = train_df["text"].apply(
    lambda text: sum(not ch.isalpha() for ch in text)
)
train_df.head()

Unnamed: 0.1,Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,date,text,favorites,retweets,is_retweet,user_location_updated,user-country_code,user-country,category,tweet_len,tweet_special chars
18046,18046,Cagsil,USA,"Knowledge, Experience & Wisdom Is Shared. It's...",2010-02-25 02:10:51,6299,2021-02-07 16:13:14,RT @Cagsil: Now would be extremely helpful to ...,0,7,True,USA,USA,United States,Negative,140,35
33939,33939,TamiWami69,"Tennessee, USA","Designer, Writer, Artist, Skeptic, Lover & inf...",2010-04-22 21:04:40,347,2021-01-21 16:29:05,RT @NickHudsonCT: Raising conflicts of interes...,0,1758,True,"Tennessee, USA",,,Negative,144,28
17696,17696,colinsmithse5,camberwell,"Hairstylist, civilised to @RichardGse5 tall & ...",2011-02-03 19:16:40,413,2021-02-05 17:28:46,@MetroUK One for all the #antivaxers #antivax ...,0,0,False,camberwell,GBR,United Kingdom,Negative,59,13
22446,22446,CharlesDiComo,"Lancaster, PA","Healthcare Executive, WellSpan, Geneticist, Ph...",2009-02-25 16:59:53,401,2021-02-08 22:57:22,#healthystepahead #takingmyshot #getvaccinated...,0,0,False,"Lancaster, PA",USA,United States,Positive,144,26
21184,21184,PfineFine,"Durham, North Carolina","A believer in people. CEO, @fhi360. All tweets...",2013-04-18 00:00:33,1747,2021-02-08 17:49:19,"RT @fhi360: @PfineFine, @FHI360’s CEO, will be...",0,1,True,"Durham, North Carolina",USA,United States,Positive,140,34


Maybe they tend to tweet on certain days?

In [23]:
# Day of the week of the tweet (Monday=0 ... Sunday=6); 'date' was parsed to datetime earlier.
train_df['Day_of_week'] = train_df['date'].dt.dayofweek
train_df.head()

Unnamed: 0.1,Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,date,text,favorites,retweets,is_retweet,user_location_updated,user-country_code,user-country,category,tweet_len,tweet_special chars,Day_of_week
18046,18046,Cagsil,USA,"Knowledge, Experience & Wisdom Is Shared. It's...",2010-02-25 02:10:51,6299,2021-02-07 16:13:14,RT @Cagsil: Now would be extremely helpful to ...,0,7,True,USA,USA,United States,Negative,140,35,6
33939,33939,TamiWami69,"Tennessee, USA","Designer, Writer, Artist, Skeptic, Lover & inf...",2010-04-22 21:04:40,347,2021-01-21 16:29:05,RT @NickHudsonCT: Raising conflicts of interes...,0,1758,True,"Tennessee, USA",,,Negative,144,28,3
17696,17696,colinsmithse5,camberwell,"Hairstylist, civilised to @RichardGse5 tall & ...",2011-02-03 19:16:40,413,2021-02-05 17:28:46,@MetroUK One for all the #antivaxers #antivax ...,0,0,False,camberwell,GBR,United Kingdom,Negative,59,13,4
22446,22446,CharlesDiComo,"Lancaster, PA","Healthcare Executive, WellSpan, Geneticist, Ph...",2009-02-25 16:59:53,401,2021-02-08 22:57:22,#healthystepahead #takingmyshot #getvaccinated...,0,0,False,"Lancaster, PA",USA,United States,Positive,144,26,0
21184,21184,PfineFine,"Durham, North Carolina","A believer in people. CEO, @fhi360. All tweets...",2013-04-18 00:00:33,1747,2021-02-08 17:49:19,"RT @fhi360: @PfineFine, @FHI360’s CEO, will be...",0,1,True,"Durham, North Carolina",USA,United States,Positive,140,34,0


The year the user account was created - maybe it's a bot?

In [24]:
# Calendar year in which the account was created; 'user_created' was parsed to datetime earlier.
train_df['Year user created'] = train_df['user_created'].dt.year
train_df.head()

Unnamed: 0.1,Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,date,text,favorites,retweets,is_retweet,user_location_updated,user-country_code,user-country,category,tweet_len,tweet_special chars,Day_of_week,Year user created
18046,18046,Cagsil,USA,"Knowledge, Experience & Wisdom Is Shared. It's...",2010-02-25 02:10:51,6299,2021-02-07 16:13:14,RT @Cagsil: Now would be extremely helpful to ...,0,7,True,USA,USA,United States,Negative,140,35,6,2010
33939,33939,TamiWami69,"Tennessee, USA","Designer, Writer, Artist, Skeptic, Lover & inf...",2010-04-22 21:04:40,347,2021-01-21 16:29:05,RT @NickHudsonCT: Raising conflicts of interes...,0,1758,True,"Tennessee, USA",,,Negative,144,28,3,2010
17696,17696,colinsmithse5,camberwell,"Hairstylist, civilised to @RichardGse5 tall & ...",2011-02-03 19:16:40,413,2021-02-05 17:28:46,@MetroUK One for all the #antivaxers #antivax ...,0,0,False,camberwell,GBR,United Kingdom,Negative,59,13,4,2011
22446,22446,CharlesDiComo,"Lancaster, PA","Healthcare Executive, WellSpan, Geneticist, Ph...",2009-02-25 16:59:53,401,2021-02-08 22:57:22,#healthystepahead #takingmyshot #getvaccinated...,0,0,False,"Lancaster, PA",USA,United States,Positive,144,26,0,2009
21184,21184,PfineFine,"Durham, North Carolina","A believer in people. CEO, @fhi360. All tweets...",2013-04-18 00:00:33,1747,2021-02-08 17:49:19,"RT @fhi360: @PfineFine, @FHI360’s CEO, will be...",0,1,True,"Durham, North Carolina",USA,United States,Positive,140,34,0,2013


Loud expressions in description - a sign?

In [25]:
# Count the non-alphabetic characters of each profile description.
# Applying over the single 'user_description' column avoids building a full row object
# per user, which is what DataFrame.apply(axis=1) did.
train_df["desc_special chars"] = train_df["user_description"].apply(
    lambda description: sum(not ch.isalpha() for ch in description)
)
train_df.head()

Unnamed: 0.1,Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,date,text,favorites,retweets,is_retweet,user_location_updated,user-country_code,user-country,category,tweet_len,tweet_special chars,Day_of_week,Year user created,desc_special chars
18046,18046,Cagsil,USA,"Knowledge, Experience & Wisdom Is Shared. It's...",2010-02-25 02:10:51,6299,2021-02-07 16:13:14,RT @Cagsil: Now would be extremely helpful to ...,0,7,True,USA,USA,United States,Negative,140,35,6,2010,28
33939,33939,TamiWami69,"Tennessee, USA","Designer, Writer, Artist, Skeptic, Lover & inf...",2010-04-22 21:04:40,347,2021-01-21 16:29:05,RT @NickHudsonCT: Raising conflicts of interes...,0,1758,True,"Tennessee, USA",,,Negative,144,28,3,2010,15
17696,17696,colinsmithse5,camberwell,"Hairstylist, civilised to @RichardGse5 tall & ...",2011-02-03 19:16:40,413,2021-02-05 17:28:46,@MetroUK One for all the #antivaxers #antivax ...,0,0,False,camberwell,GBR,United Kingdom,Negative,59,13,4,2011,21
22446,22446,CharlesDiComo,"Lancaster, PA","Healthcare Executive, WellSpan, Geneticist, Ph...",2009-02-25 16:59:53,401,2021-02-08 22:57:22,#healthystepahead #takingmyshot #getvaccinated...,0,0,False,"Lancaster, PA",USA,United States,Positive,144,26,0,2009,10
21184,21184,PfineFine,"Durham, North Carolina","A believer in people. CEO, @fhi360. All tweets...",2013-04-18 00:00:33,1747,2021-02-08 17:49:19,"RT @fhi360: @PfineFine, @FHI360’s CEO, will be...",0,1,True,"Durham, North Carolina",USA,United States,Positive,140,34,0,2013,37


In [26]:
# The ten user names that occur most often in the training tweets
# (value_counts sorts by descending frequency).
ten_most_common_names = train_df['user_name'].value_counts().index[:10].tolist()
ten_most_common_names

['Nakhasi_MD',
 'SunshineTheGrey',
 'Cagsil',
 'DeniseVFriend',
 'lizditz',
 'ThisIsOurShot',
 'Rosewind2007',
 'drkkyu',
 'doritmi',
 'trustyourhearth']

In [27]:
# 1 when the tweet's author is one of the ten most frequent training authors, else 0.
# isin() is vectorised, replacing the row-by-row DataFrame.apply(axis=1).
train_df['ten most common names'] = train_df['user_name'].isin(ten_most_common_names).astype(int)
train_df.head()

Unnamed: 0.1,Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,date,text,favorites,retweets,...,user_location_updated,user-country_code,user-country,category,tweet_len,tweet_special chars,Day_of_week,Year user created,desc_special chars,ten most common names
18046,18046,Cagsil,USA,"Knowledge, Experience & Wisdom Is Shared. It's...",2010-02-25 02:10:51,6299,2021-02-07 16:13:14,RT @Cagsil: Now would be extremely helpful to ...,0,7,...,USA,USA,United States,Negative,140,35,6,2010,28,1
33939,33939,TamiWami69,"Tennessee, USA","Designer, Writer, Artist, Skeptic, Lover & inf...",2010-04-22 21:04:40,347,2021-01-21 16:29:05,RT @NickHudsonCT: Raising conflicts of interes...,0,1758,...,"Tennessee, USA",,,Negative,144,28,3,2010,15,0
17696,17696,colinsmithse5,camberwell,"Hairstylist, civilised to @RichardGse5 tall & ...",2011-02-03 19:16:40,413,2021-02-05 17:28:46,@MetroUK One for all the #antivaxers #antivax ...,0,0,...,camberwell,GBR,United Kingdom,Negative,59,13,4,2011,21,0
22446,22446,CharlesDiComo,"Lancaster, PA","Healthcare Executive, WellSpan, Geneticist, Ph...",2009-02-25 16:59:53,401,2021-02-08 22:57:22,#healthystepahead #takingmyshot #getvaccinated...,0,0,...,"Lancaster, PA",USA,United States,Positive,144,26,0,2009,10,0
21184,21184,PfineFine,"Durham, North Carolina","A believer in people. CEO, @fhi360. All tweets...",2013-04-18 00:00:33,1747,2021-02-08 17:49:19,"RT @fhi360: @PfineFine, @FHI360’s CEO, will be...",0,1,...,"Durham, North Carolina",USA,United States,Positive,140,34,0,2013,37,0


## Test

In [28]:
# Number of characters in each tweet.
test_df['tweet_len'] = test_df['text'].str.len()
test_df.head()

Unnamed: 0.1,Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,date,text,favorites,retweets,is_retweet,user_location_updated,user-country_code,user-country,category,tweet_len
19461,19461,KimYuMD,other,"Regional Medical Director, Aledade\nFamily Phy...",2021-01-29 05:26:33,172,2021-02-08 22:54:25,"RT @CKefalas: Hi, America: a reminder tonight’...",0,6,True,,,,Positive,140
5560,5560,TrevorMacduff,Scotland,For Independent Scotland. Communications for S...,2020-02-10 09:58:55,31,2021-02-09 07:36:33,RT @indyNurseBrian: Had to continue on foot a ...,0,227,True,Scotland,GBR,United Kingdom,Positive,140
33958,33958,dab7cdaf46b0463,other,"Black Country Wench, spent the last 20 years t...",2014-05-12 22:16:03,93,2021-01-21 16:07:01,RT @NickHudsonCT: Raising conflicts of interes...,0,1758,True,,,,Negative,144
16513,16513,Paulm27202322,other,other,2019-11-25 13:28:36,40,2021-02-08 15:17:43,RT @NewtonClarkeUK: PREDICTION 1:\nThe vaccine...,0,104,True,,,,Negative,140
3689,3689,ShrinersStLouis,"St. Louis, Missouri","Pediatric orthopaedic care, regardless of a fa...",2009-02-27 20:03:23,1480,2021-02-10 14:42:18,We're helping K.O. COVID! The second round of ...,10,1,True,"St Louis, Missouri",USA,United States,Positive,140


In [None]:
# Count the non-alphabetic characters of each tweet (punctuation, digits, whitespace, emoji ...).
# Applying over the single 'text' column avoids building a full row object per tweet,
# which is what DataFrame.apply(axis=1) did.
test_df["tweet_special chars"] = test_df["text"].apply(
    lambda text: sum(not ch.isalpha() for ch in text)
)
test_df.head()

In [None]:
# Day of the week of the tweet (Monday=0 ... Sunday=6); 'date' was parsed to datetime earlier.
test_df['Day_of_week'] = test_df['date'].dt.dayofweek
test_df.head()

In [None]:
# Calendar year in which the account was created; 'user_created' was parsed to datetime earlier.
test_df['Year user created'] = test_df['user_created'].dt.year
test_df.head()

In [None]:
# Count the non-alphabetic characters of each profile description.
# Applying over the single 'user_description' column avoids building a full row object
# per user, which is what DataFrame.apply(axis=1) did.
test_df["desc_special chars"] = test_df["user_description"].apply(
    lambda description: sum(not ch.isalpha() for ch in description)
)
test_df.head()

In [None]:
# 1 when the tweet's author is one of the ten most frequent *training* authors, else 0
# (the list is deliberately the one computed on train, so test data does not influence it).
# isin() is vectorised, replacing the row-by-row DataFrame.apply(axis=1).
test_df['ten most common names'] = test_df['user_name'].isin(ten_most_common_names).astype(int)
test_df.head()

# Preprocessing

In [None]:
# List the available columns before choosing which ones to keep.
train_df.columns

In [None]:
# Columns kept for modelling: the raw numeric/boolean fields, the engineered features,
# the location (one-hot encoded later) and the target 'category'.
selected = ['user_location', 'user_followers', 'favorites', 'retweets', 'is_retweet',
       'category', 'tweet_len', 'tweet_special chars', 'Day_of_week',
       'Year user created', 'desc_special chars', 'ten most common names']
# .copy() makes both frames independent of the frames they were sliced from, so the
# assignments in the following cells unambiguously modify these frames.
train_df = train_df[selected].copy()
test_df = test_df[selected].copy()

In [None]:
# Locations that occur at most 11 times in train are merged into the bucket 'other',
# so that they do not each get their own dummy column.
value_cnts = train_df['user_location'].value_counts()
to_other = value_cnts[value_cnts <= 11].index.tolist()
# Single .loc assignment instead of chained indexing (train_df.user_location[mask] = ...),
# which may write to a temporary copy and leave train_df unchanged.
train_df.loc[train_df['user_location'].isin(to_other), 'user_location'] = 'other'

In [None]:
# One-hot encode the location; all other columns are passed through unchanged.
train_df = pd.get_dummies(train_df, columns=['user_location'])
train_df.shape

In [None]:
# Locations that kept their own category in train (everything counted there minus the rare ones).
known_locations = set(value_cnts.index) - set(to_other)
# Every other test location becomes 'other': this covers the rare train locations and also
# locations that never occurred in train, which would otherwise produce dummy columns the
# model has never seen. Single .loc assignment instead of chained indexing, which may write
# to a temporary copy.
test_df.loc[~test_df['user_location'].isin(known_locations), 'user_location'] = 'other'

In [None]:
# One-hot encode the location; all other columns are passed through unchanged.
test_df = pd.get_dummies(test_df, columns=['user_location'])
test_df.shape

In [None]:
# Encode the boolean retweet flag as 1 (True) / 0 (False).
train_df['is_retweet'] = train_df['is_retweet'].astype(int)

# Encode the label: Positive -> 0, Negative -> 1. The 0/1 identity entries keep the cell
# safe to re-run. Whole-column assignment replaces the chained indexing
# (train_df.category[mask] = ...), which may write to a temporary copy.
train_df['category'] = train_df['category'].map({'Positive': 0, 'Negative': 1, 0: 0, 1: 1})

train_df.head()

In [None]:
# Encode the boolean retweet flag as 1 (True) / 0 (False).
test_df['is_retweet'] = test_df['is_retweet'].astype(int)

# Encode the label: Positive -> 0, Negative -> 1. The 0/1 identity entries keep the cell
# safe to re-run. Whole-column assignment replaces the chained indexing
# (test_df.category[mask] = ...), which may write to a temporary copy.
test_df['category'] = test_df['category'].map({'Positive': 0, 'Negative': 1, 0: 0, 1: 1})

test_df.head()

# Remove redundant (near-constant) binary columns

In [None]:
def get_redundant_binary_cols(data, threshold):
    '''
    Find binary (0/1) columns that are dominated by a single value.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose columns are inspected.
    threshold : float
        Fraction of the rows of ``data``. A column is reported when the number
        of its 0s, or of its 1s, is strictly greater than
        ``threshold * len(data)``.

    Returns
    -------
    list
        Names of the redundant binary columns: first the ones dominated by 0,
        then the ones dominated by 1, each group in the column order of ``data``.
    '''
    # A column counts as binary when every distinct value it holds is 0 or 1
    # (True/False compare equal to 1/0, so boolean columns qualify as well).
    bool_cols = [col for col in data.columns
                 if pd.Series(list(set(data[col]))).isin([0, 1]).all()]

    # Measure against the frame that was passed in rather than a global training
    # set, so the threshold means the same thing for frames of any size.
    limit = threshold * len(data)

    # Counting via a comparison yields 0 when a value never occurs, whereas
    # value_counts()[0] / value_counts()[1] raised KeyError for constant columns.
    mostly_zero = [col for col in bool_cols if (data[col] == 0).sum() > limit]
    mostly_one = [col for col in bool_cols if (data[col] == 1).sum() > limit]

    return mostly_zero + mostly_one

In [None]:
# Binary training columns in which a single value covers more than 90% of the rows.
to_rm = get_redundant_binary_cols(data=train_df, threshold=0.9)

In [None]:
# Remove the redundant columns from the training frame and show what is left.
train_df = train_df.drop(columns=to_rm)
print(train_df.shape)
train_df.head()

In [None]:
# Reuse the columns selected on the training set instead of re-selecting on test, so that
# both frames lose the same features and the test data does not drive feature selection.
# Only names that test actually has are kept, so the following drop cannot raise KeyError.
to_rm = [col for col in to_rm if col in test_df.columns]

In [None]:
# Remove the selected columns from the test frame and show what is left.
test_df = test_df.drop(columns=to_rm)
print(test_df.shape)
test_df.head()

# Making sure train and test have the same columns

In [None]:
# Give test exactly the columns of train, in the same order:
#  * columns that exist only in test are dropped (as before);
#  * columns that test lacks are added filled with 0, so that selecting the training
#    features from test_df later cannot raise KeyError. 0 is the right filler for
#    one-hot dummy columns (the category simply does not occur in test).
test_df = test_df.reindex(columns=train_df.columns, fill_value=0)
features_test = list(test_df.columns)
test_df.shape

# Splitting again into target and features

In [None]:
# Redividing train and test sets 
target = 'category'
features = [x for x in train_df.columns if x != target]

In [None]:
# Final model inputs: feature matrices plus integer-typed label vectors.
X_train, y_train = train_df[features], train_df[target].astype(int)
X_test, y_test = test_df[features], test_df[target].astype(int)

In [None]:
# Shapes of the final feature matrices and label vectors.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

# Model

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyper-parameters; random_state is fixed for reproducibility.
# fit() returns the fitted estimator itself, so construction and training can be chained.
clf = RandomForestClassifier(random_state=0).fit(X_train, y_train)
# Predicted label for every test row.
y_pred = clf.predict(X_test)

Just checking - always paranoid...

In [None]:
# Distinct labels the model predicted on the test set.
np.unique(y_pred)

That looks fine!

In [None]:
from sklearn.metrics import roc_curve, roc_auc_score

# Predicted probability of the second class (column 1 of predict_proba) for every test row.
y_pred_probas = clf.predict_proba(X_test)[:, 1]
# Points of the ROC curve (thresholds are not needed) and the area under it.
fpr, tpr, _ = roc_curve(y_test, y_pred_probas)
auc = roc_auc_score(y_test, y_pred_probas)

In [None]:
# ROC curve of the classifier, drawn with the explicit Axes interface.
fig = plt.figure(figsize=(8, 6))
ax = fig.gca()

ax.plot(fpr, tpr, label="AUC={:.3f}".format(auc))
# Diagonal = performance of a random classifier, for reference.
ax.plot([0, 1], [0, 1], color='orange', linestyle='--')

ticks = np.arange(0.0, 1.1, step=0.1)
ax.set_xticks(ticks)
ax.set_yticks(ticks)
ax.set_xlabel("False Positive Rate", fontsize=15)
ax.set_ylabel("True Positive Rate", fontsize=15)
ax.set_title('ROC Curve Analysis', fontweight='bold', fontsize=15)
ax.legend(prop={'size':13}, loc='lower right')

plt.show()

Not very impressive results... Let's see the full picture:

In [None]:
from sklearn.metrics import (
    ConfusionMatrixDisplay,
    accuracy_score,
    classification_report,
    confusion_matrix,
)

# Per-class precision / recall / F1 on the test set.
print(classification_report(y_test, y_pred))

# Confusion matrix of the test predictions. sklearn.metrics.plot_confusion_matrix has been
# removed from scikit-learn, so the matrix is computed explicitly and drawn with
# ConfusionMatrixDisplay. y_pred is clf.predict(X_test), i.e. the same predictions
# plot_confusion_matrix(clf, X_test, y_test) computed internally.
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_)
disp.plot(cmap=plt.cm.Blues)
title = "Confusion matrix"
disp.ax_.set_title(title)

print(title)
print(disp.confusion_matrix)

plt.show()

# Share of test rows that were classified correctly.
accu = accuracy_score(y_test, y_pred)
print(f"Accuracy:\n{accu}")

In [None]:
# Score the forest on the data it was fitted on, to compare against the
# test accuracy and spot overfitting.
y_pred_train = clf.predict(X_train)
accu_train = accuracy_score(y_true=y_train, y_pred=y_pred_train)
print(f"Accuracy train:\n{accu_train}")

# Comparison with other models

In [None]:
# Hard-coded comparison data for the "nofar" model: an array of false-positive
# rates, an array of true-positive rates, and a scalar AUC.
# NOTE(review): presumably pasted from the roc_curve / roc_auc_score output of
# another notebook — confirm the source and that it was evaluated on a
# comparable test set before drawing conclusions from the comparison.
fpr_nofar = np.array([0.        , 0.        , 0.        , 0.        , 0.        ,
       0.        , 0.        , 0.00157233, 0.00157233, 0.00157233,
       0.00157233, 0.00157233, 0.00157233, 0.00314465, 0.00314465,
       0.00471698, 0.00471698, 0.00628931, 0.00628931, 0.00786164,
       0.00786164, 0.00943396, 0.00943396, 0.01100629, 0.01100629,
       0.01257862, 0.01257862, 0.01415094, 0.01415094, 0.01572327,
       0.01572327, 0.0172956 , 0.0172956 , 0.01886792, 0.01886792,
       0.02044025, 0.02044025, 0.02201258, 0.02201258, 0.02358491,
       0.02358491, 0.02515723, 0.02515723, 0.02672956, 0.02672956,
       0.02830189, 0.02830189, 0.02987421, 0.02987421, 0.03144654,
       0.03144654, 0.03301887, 0.03301887, 0.03459119, 0.03459119,
       0.03616352, 0.03616352, 0.03773585, 0.03773585, 0.0408805 ,
       0.0408805 , 0.04245283, 0.04245283, 0.04402516, 0.04402516,
       0.04559748, 0.04559748, 0.04716981, 0.04716981, 0.04874214,
       0.04874214, 0.05031447, 0.05031447, 0.05345912, 0.05345912,
       0.05660377, 0.05660377, 0.0581761 , 0.0581761 , 0.06289308,
       0.06289308, 0.06446541, 0.06446541, 0.06603774, 0.06603774,
       0.06918239, 0.06918239, 0.07075472, 0.07075472, 0.08176101,
       0.08176101, 0.08333333, 0.08333333, 0.08962264, 0.08962264,
       0.0927673 , 0.0927673 , 0.09748428, 0.09748428, 0.10220126,
       0.10220126, 0.10534591, 0.10534591, 0.12264151, 0.12264151,
       0.12421384, 0.12421384, 0.13050314, 0.13050314, 0.13522013,
       0.13522013, 0.13679245, 0.13679245, 0.15566038, 0.15566038,
       0.17138365, 0.17138365, 0.18081761, 0.18081761, 0.19025157,
       0.19025157, 0.2091195 , 0.2091195 , 0.2327044 , 0.2327044 ,
       0.23899371, 0.23899371, 0.29716981, 0.29716981, 0.43867925,
       0.4418239 , 0.81918239, 0.82232704, 0.82861635, 0.83176101,
       0.84119497, 0.84433962, 0.87421384, 0.87735849, 0.8836478 ,
       0.88679245, 0.88836478, 0.89150943, 0.90251572, 0.90566038,
       0.9072327 , 0.91352201, 0.91509434, 0.91823899, 1.        ])
tpr_nofar = np.array([0.00000000e+00, 6.93481276e-04, 1.00554785e-01, 1.01941748e-01,
       1.55339806e-01, 1.56726768e-01, 2.62829404e-01, 2.62829404e-01,
       3.68932039e-01, 3.70319001e-01, 4.77115118e-01, 4.78502080e-01,
       5.58945908e-01, 5.58945908e-01, 8.19001387e-01, 8.19001387e-01,
       8.23162275e-01, 8.23162275e-01, 8.32871012e-01, 8.32871012e-01,
       8.59916782e-01, 8.59916782e-01, 8.64077670e-01, 8.64077670e-01,
       8.77253814e-01, 8.77253814e-01, 8.98751734e-01, 8.98751734e-01,
       9.04299584e-01, 9.04299584e-01, 9.11234397e-01, 9.11234397e-01,
       9.11927878e-01, 9.11927878e-01, 9.12621359e-01, 9.12621359e-01,
       9.16782247e-01, 9.16782247e-01, 9.19556172e-01, 9.19556172e-01,
       9.25797503e-01, 9.25797503e-01, 9.28571429e-01, 9.28571429e-01,
       9.29958391e-01, 9.29958391e-01, 9.34812760e-01, 9.34812760e-01,
       9.38280166e-01, 9.38280166e-01, 9.43134535e-01, 9.43134535e-01,
       9.45908460e-01, 9.45908460e-01, 9.48682386e-01, 9.48682386e-01,
       9.50069348e-01, 9.50069348e-01, 9.50762829e-01, 9.50762829e-01,
       9.52149792e-01, 9.52149792e-01, 9.54923717e-01, 9.54923717e-01,
       9.56310680e-01, 9.56310680e-01, 9.59778086e-01, 9.59778086e-01,
       9.61858530e-01, 9.61858530e-01, 9.65325936e-01, 9.65325936e-01,
       9.68099861e-01, 9.68099861e-01, 9.68793343e-01, 9.68793343e-01,
       9.70180305e-01, 9.70180305e-01, 9.74341193e-01, 9.74341193e-01,
       9.75034674e-01, 9.75034674e-01, 9.75728155e-01, 9.75728155e-01,
       9.76421637e-01, 9.76421637e-01, 9.78502080e-01, 9.78502080e-01,
       9.80582524e-01, 9.80582524e-01, 9.81276006e-01, 9.81276006e-01,
       9.81969487e-01, 9.81969487e-01, 9.83356449e-01, 9.83356449e-01,
       9.84049931e-01, 9.84049931e-01, 9.85436893e-01, 9.85436893e-01,
       9.86130374e-01, 9.86130374e-01, 9.87517337e-01, 9.87517337e-01,
       9.88210818e-01, 9.88210818e-01, 9.90291262e-01, 9.90291262e-01,
       9.91678225e-01, 9.91678225e-01, 9.92371706e-01, 9.92371706e-01,
       9.93065187e-01, 9.93065187e-01, 9.94452150e-01, 9.94452150e-01,
       9.95145631e-01, 9.95145631e-01, 9.95839112e-01, 9.95839112e-01,
       9.96532594e-01, 9.96532594e-01, 9.97919556e-01, 9.97919556e-01,
       9.98613037e-01, 9.98613037e-01, 9.99306519e-01, 9.99306519e-01,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00, 1.00000000e+00, 1.00000000e+00,
       1.00000000e+00, 1.00000000e+00])
auc_nofar = 0.9923313619274418

In [None]:
# Hard-coded comparison data for the "yaniv" model: an array of false-positive
# rates, an array of true-positive rates, and a scalar AUC.
# NOTE(review): presumably pasted from the roc_curve / roc_auc_score output of
# another notebook — confirm the source and that it was evaluated on a
# comparable test set before drawing conclusions from the comparison.
fpr_yaniv = np.array([0.        , 0.        , 0.        , 0.00127389, 0.00127389,
       0.00254777, 0.00254777, 0.00254777, 0.00254777, 0.00382166,
       0.00382166, 0.00509554, 0.00509554, 0.00636943, 0.00636943,
       0.00636943, 0.00636943, 0.00764331, 0.00764331, 0.01019108,
       0.01019108, 0.01146497, 0.01146497, 0.01273885, 0.01273885,
       0.01528662, 0.01528662, 0.01656051, 0.01656051, 0.01783439,
       0.01783439, 0.01910828, 0.01910828, 0.02038217, 0.02038217,
       0.02292994, 0.02292994, 0.02420382, 0.02420382, 0.02547771,
       0.02547771, 0.02675159, 0.02675159, 0.0343949 , 0.0343949 ,
       0.03566879, 0.03566879, 0.03821656, 0.03821656, 0.03949045,
       0.03949045, 0.04076433, 0.04076433, 0.04458599, 0.04458599,
       0.04585987, 0.04585987, 0.04968153, 0.04968153, 0.05095541,
       0.05095541, 0.0522293 , 0.0522293 , 0.05350318, 0.05350318,
       0.05987261, 0.05987261, 0.06624204, 0.06624204, 0.07006369,
       0.07006369, 0.07388535, 0.07388535, 0.07770701, 0.07770701,
       0.08152866, 0.08152866, 0.08535032, 0.08535032, 0.09426752,
       0.09426752, 0.10063694, 0.10063694, 0.1044586 , 0.1044586 ,
       0.11719745, 0.11847134, 0.12484076, 0.12484076, 0.1388535 ,
       0.1388535 , 0.15414013, 0.15414013, 0.17452229, 0.17452229,
       0.20254777, 0.20254777, 0.22292994, 0.22292994, 0.24203822,
       0.24203822, 0.25095541, 0.25095541, 0.36178344, 0.36178344,
       0.4089172 , 0.4089172 , 0.4522293 , 0.4522293 , 0.45605096,
       0.45605096, 0.4955414 , 0.49808917, 0.54649682, 0.54904459,
       0.58471338, 0.58471338, 0.58726115, 0.61273885, 0.61528662,
       0.65477707, 0.65477707, 0.75031847, 0.7566879 , 0.89171975,
       0.89426752, 0.93121019, 0.93375796, 0.9910828 , 0.99617834,
       1.        ])
tpr_yaniv = np.array([0.        , 0.00341297, 0.02389078, 0.02730375, 0.04095563,
       0.04095563, 0.25938567, 0.2662116 , 0.29010239, 0.29010239,
       0.38225256, 0.38225256, 0.46075085, 0.46075085, 0.52559727,
       0.53242321, 0.56313993, 0.56313993, 0.57337884, 0.57337884,
       0.59726962, 0.59726962, 0.60409556, 0.60409556, 0.67235495,
       0.67235495, 0.67576792, 0.67576792, 0.6894198 , 0.6894198 ,
       0.76109215, 0.76109215, 0.77474403, 0.77474403, 0.78498294,
       0.78498294, 0.79180887, 0.79180887, 0.80887372, 0.80887372,
       0.81911263, 0.81911263, 0.82593857, 0.82593857, 0.82935154,
       0.82935154, 0.83959044, 0.83959044, 0.84300341, 0.84300341,
       0.85324232, 0.85324232, 0.8668942 , 0.8668942 , 0.87030717,
       0.87030717, 0.87372014, 0.87372014, 0.87713311, 0.87713311,
       0.88054608, 0.88054608, 0.88395904, 0.88395904, 0.89078498,
       0.89078498, 0.89419795, 0.89419795, 0.89761092, 0.89761092,
       0.90102389, 0.90102389, 0.91467577, 0.91467577, 0.91808874,
       0.91808874, 0.93174061, 0.93174061, 0.93856655, 0.93856655,
       0.94197952, 0.94197952, 0.94539249, 0.94539249, 0.94880546,
       0.94880546, 0.95221843, 0.95221843, 0.9556314 , 0.9556314 ,
       0.95904437, 0.95904437, 0.96245734, 0.96245734, 0.96587031,
       0.96587031, 0.96928328, 0.96928328, 0.97269625, 0.97269625,
       0.97610922, 0.97610922, 0.97952218, 0.97952218, 0.98293515,
       0.98293515, 0.98634812, 0.98634812, 0.98976109, 0.98976109,
       0.99317406, 0.99317406, 0.99317406, 0.99317406, 0.99317406,
       0.99317406, 0.99658703, 0.99658703, 0.99658703, 0.99658703,
       0.99658703, 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        ])
auc_yaniv = 0.9706832460163908

In [None]:
# Hard-coded comparison data for the "neta" model: an array of false-positive
# rates, an array of true-positive rates, and a scalar AUC.
# NOTE(review): presumably pasted from the roc_curve / roc_auc_score output of
# another notebook — confirm the source and that it was evaluated on a
# comparable test set before drawing conclusions from the comparison.
fpr_neta = np.array([0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 0.00000000e+00, 0.00000000e+00, 0.00000000e+00,
       0.00000000e+00, 6.88231246e-04, 6.88231246e-04, 6.88231246e-04,
       6.88231246e-04, 6.88231246e-04, 6.88231246e-04, 6.88231246e-04,
       6.88231246e-04, 6.88231246e-04, 6.88231246e-04, 1.37646249e-03,
       1.37646249e-03, 2.06469374e-03, 2.06469374e-03, 2.06469374e-03,
       2.06469374e-03, 4.12938747e-03, 4.12938747e-03, 5.50584997e-03,
       5.50584997e-03, 6.19408121e-03, 6.19408121e-03, 6.88231246e-03,
       6.88231246e-03, 7.57054370e-03, 7.57054370e-03, 8.25877495e-03,
       8.25877495e-03, 8.94700619e-03, 8.94700619e-03, 9.63523744e-03,
       9.63523744e-03, 9.63523744e-03, 9.63523744e-03, 1.10116999e-02,
       1.10116999e-02, 1.16999312e-02, 1.16999312e-02, 1.30763937e-02,
       1.30763937e-02, 1.37646249e-02, 1.37646249e-02, 1.51410874e-02,
       1.51410874e-02, 1.58293187e-02, 1.58293187e-02, 1.58293187e-02,
       1.65175499e-02, 1.65175499e-02, 1.72057811e-02, 1.72057811e-02,
       1.85822436e-02, 1.85822436e-02, 1.92704749e-02, 1.92704749e-02,
       1.99587061e-02, 1.99587061e-02, 2.13351686e-02, 2.13351686e-02,
       2.33998624e-02, 2.33998624e-02, 2.47763248e-02, 2.47763248e-02,
       2.61527873e-02, 2.61527873e-02, 2.75292498e-02, 2.95939436e-02,
       2.95939436e-02, 3.09704061e-02, 3.09704061e-02, 3.57880248e-02,
       3.57880248e-02, 3.71644873e-02, 3.71644873e-02, 3.78527185e-02,
       3.78527185e-02, 3.85409498e-02, 3.85409498e-02, 3.92291810e-02,
       3.92291810e-02, 4.06056435e-02, 4.06056435e-02, 4.47350310e-02,
       4.47350310e-02, 4.47350310e-02, 4.54232622e-02, 4.54232622e-02,
       4.61114935e-02, 4.61114935e-02, 4.81761872e-02, 4.81761872e-02,
       5.02408809e-02, 5.02408809e-02, 5.09291122e-02, 5.09291122e-02,
       5.43702684e-02, 5.43702684e-02, 5.50584997e-02, 5.50584997e-02,
       5.57467309e-02, 5.57467309e-02, 5.98761184e-02, 5.98761184e-02,
       6.05643496e-02, 6.19408121e-02, 6.19408121e-02, 6.26290434e-02,
       6.26290434e-02, 6.33172746e-02, 6.33172746e-02, 6.46937371e-02,
       6.46937371e-02, 6.67584308e-02, 6.67584308e-02, 6.74466621e-02,
       6.74466621e-02, 7.01995871e-02, 7.01995871e-02, 7.15760496e-02,
       7.15760496e-02, 7.43289745e-02, 7.43289745e-02, 7.43289745e-02,
       7.50172058e-02, 7.50172058e-02, 7.70818995e-02, 7.70818995e-02,
       8.05230557e-02, 8.05230557e-02, 8.46524432e-02, 8.46524432e-02,
       8.53406745e-02, 8.53406745e-02, 8.60289057e-02, 8.60289057e-02,
       8.94700619e-02, 8.94700619e-02, 9.08465244e-02, 9.49759119e-02,
       9.49759119e-02, 9.70406056e-02, 9.70406056e-02, 9.77288369e-02,
       9.77288369e-02, 9.91052994e-02, 1.00481762e-01, 1.07364074e-01,
       1.07364074e-01, 1.08740537e-01, 1.10116999e-01, 1.12869924e-01,
       1.12869924e-01, 1.19752237e-01, 1.19752237e-01, 1.20440468e-01,
       1.20440468e-01, 1.26634549e-01, 1.26634549e-01, 1.29387474e-01,
       1.29387474e-01, 1.41775637e-01, 1.41775637e-01, 1.42463868e-01,
       1.42463868e-01, 1.46593255e-01, 1.46593255e-01, 1.52099105e-01,
       1.52099105e-01, 1.52787337e-01, 1.52787337e-01, 1.54163799e-01,
       1.54163799e-01, 1.55540262e-01, 1.56916724e-01, 1.56916724e-01,
       1.58981418e-01, 1.58981418e-01, 1.59669649e-01, 1.61046111e-01,
       1.64487268e-01, 1.64487268e-01, 1.67240193e-01, 1.67240193e-01,
       1.72746043e-01, 1.72746043e-01, 1.74122505e-01, 1.74122505e-01,
       1.83757743e-01, 1.83757743e-01, 1.85134205e-01, 1.91328286e-01,
       1.91328286e-01, 1.92704749e-01, 1.93392980e-01, 1.93392980e-01,
       2.09222299e-01, 2.09222299e-01, 2.13351686e-01, 2.13351686e-01,
       2.14039917e-01, 2.14039917e-01, 2.17481074e-01, 2.17481074e-01,
       2.21610461e-01, 2.21610461e-01, 2.24363386e-01, 2.24363386e-01,
       2.25051617e-01, 2.25051617e-01, 2.28492774e-01, 2.28492774e-01,
       2.31933930e-01, 2.31933930e-01, 2.32622161e-01, 2.32622161e-01,
       2.35375086e-01, 2.36751549e-01, 2.37439780e-01, 2.37439780e-01,
       2.38816242e-01, 2.38816242e-01, 2.44322092e-01, 2.45698555e-01,
       2.56710255e-01, 2.58086717e-01, 2.58774948e-01, 2.60151411e-01,
       2.63592567e-01, 2.64969030e-01, 2.84239504e-01, 2.84239504e-01,
       3.28974535e-01, 3.30350998e-01, 3.34480385e-01, 3.35856848e-01,
       3.42739160e-01, 3.44115623e-01, 3.68891948e-01, 3.70268410e-01,
       3.72333104e-01, 3.72333104e-01, 3.74397798e-01, 3.74397798e-01,
       3.74397798e-01, 3.77838954e-01, 3.79215416e-01, 3.92980041e-01,
       3.96421198e-01, 3.97797660e-01, 4.01927047e-01, 4.01927047e-01,
       4.03303510e-01, 4.03991741e-01, 4.05368204e-01, 4.10874054e-01,
       4.10874054e-01, 4.33585685e-01, 4.33585685e-01, 4.71438403e-01,
       4.72814866e-01, 4.77632485e-01, 4.79697178e-01, 4.81073641e-01,
       4.84514797e-01, 4.84514797e-01, 5.12732278e-01, 5.12732278e-01,
       5.18238128e-01, 5.19614591e-01, 5.20991053e-01, 5.22367515e-01,
       5.33379215e-01, 5.34755678e-01, 5.45079147e-01, 5.47143840e-01,
       5.49896765e-01, 5.98761184e-01, 6.00137646e-01, 6.63454921e-01,
       6.64831383e-01, 6.82725396e-01, 6.84101858e-01, 6.92360633e-01,
       6.93737096e-01, 7.02684102e-01, 7.04748796e-01, 7.14384033e-01,
       7.15760496e-01, 7.17136958e-01, 7.18513421e-01, 7.23331039e-01,
       7.25395733e-01, 7.29525120e-01, 7.30901583e-01, 7.42601514e-01,
       7.43977977e-01, 7.52236752e-01, 7.53613214e-01, 7.77701308e-01,
       7.79077770e-01, 7.82518926e-01, 7.84583620e-01, 7.95595320e-01,
       7.96971783e-01, 8.05230557e-01, 8.06607020e-01, 8.10736407e-01,
       8.12112870e-01, 8.12801101e-01, 8.14177564e-01, 8.14865795e-01,
       8.16242257e-01, 8.22436339e-01, 8.23812801e-01, 8.47212663e-01,
       8.49277357e-01, 8.53406745e-01, 8.54783207e-01, 8.63041982e-01,
       8.67171370e-01, 8.67859601e-01, 8.71300757e-01, 8.71988988e-01,
       8.73365451e-01, 8.76118376e-01, 8.77494838e-01, 8.78871301e-01,
       8.82312457e-01, 8.83688919e-01, 8.85065382e-01, 8.86441844e-01,
       8.87130076e-01, 8.88506538e-01, 8.92635926e-01, 8.94700619e-01,
       8.97453544e-01, 9.02271163e-01, 9.02959394e-01, 9.04335857e-01,
       9.09153476e-01, 9.10529938e-01, 9.22918100e-01, 9.24294563e-01,
       9.33241569e-01, 9.34618032e-01, 9.35306263e-01, 9.37370957e-01,
       9.40123882e-01, 9.43565038e-01, 9.44941500e-01, 9.48382657e-01,
       9.50447350e-01, 9.53888507e-01, 9.55953200e-01, 9.60082588e-01,
       9.62147281e-01, 9.63523744e-01, 9.65588438e-01, 9.66964900e-01,
       9.70406056e-01, 9.72470750e-01, 9.73158981e-01, 9.88300069e-01,
       9.92429456e-01, 9.93805919e-01, 9.97247075e-01, 1.00000000e+00]) 
tpr_neta = np.array([0.        , 0.00155763, 0.01713396, 0.02180685, 0.02492212,
       0.02959502, 0.03271028, 0.03582555, 0.04205607, 0.04361371,
       0.04672897, 0.0529595 , 0.05607477, 0.07320872, 0.07632399,
       0.07788162, 0.08255452, 0.08411215, 0.09190031, 0.09501558,
       0.11682243, 0.12616822, 0.13395639, 0.13551402, 0.13862928,
       0.14330218, 0.14797508, 0.15109034, 0.15264798, 0.17757009,
       0.18068536, 0.18224299, 0.20404984, 0.24922118, 0.25077882,
       0.25389408, 0.25856698, 0.26323988, 0.27102804, 0.2834891 ,
       0.30373832, 0.31308411, 0.31619938, 0.32087227, 0.32554517,
       0.33021807, 0.33489097, 0.33800623, 0.34267913, 0.35202492,
       0.35358255, 0.35669782, 0.36292835, 0.36604361, 0.36760125,
       0.37383178, 0.38317757, 0.38785047, 0.39875389, 0.40498442,
       0.40965732, 0.41277259, 0.44080997, 0.44392523, 0.47507788,
       0.48286604, 0.5       , 0.5046729 , 0.51557632, 0.52024922,
       0.52336449, 0.52647975, 0.52959502, 0.53426791, 0.53738318,
       0.57476636, 0.58099688, 0.58566978, 0.58878505, 0.59657321,
       0.59968847, 0.59968847, 0.605919  , 0.60903427, 0.61838006,
       0.62149533, 0.62616822, 0.62928349, 0.63395639, 0.63707165,
       0.6588785 , 0.6588785 , 0.67445483, 0.67445483, 0.67601246,
       0.67912773, 0.70249221, 0.70249221, 0.71962617, 0.71962617,
       0.72274143, 0.72274143, 0.7258567 , 0.7258567 , 0.73520249,
       0.73520249, 0.74299065, 0.74299065, 0.74766355, 0.74766355,
       0.75389408, 0.75389408, 0.75700935, 0.76012461, 0.76323988,
       0.76323988, 0.76635514, 0.76635514, 0.76791277, 0.76791277,
       0.77102804, 0.77102804, 0.77570093, 0.77570093, 0.78037383,
       0.78037383, 0.78193146, 0.78504673, 0.78504673, 0.79283489,
       0.79283489, 0.79439252, 0.79439252, 0.79595016, 0.79750779,
       0.80218069, 0.80218069, 0.80996885, 0.80996885, 0.81931464,
       0.81931464, 0.82398754, 0.82398754, 0.82554517, 0.82554517,
       0.8271028 , 0.8271028 , 0.8271028 , 0.82866044, 0.82866044,
       0.83021807, 0.83021807, 0.8317757 , 0.8317757 , 0.83333333,
       0.83333333, 0.83489097, 0.83489097, 0.83800623, 0.83800623,
       0.83956386, 0.83956386, 0.8411215 , 0.8411215 , 0.84423676,
       0.84735202, 0.84735202, 0.85202492, 0.85202492, 0.85514019,
       0.85514019, 0.85669782, 0.85669782, 0.85825545, 0.85981308,
       0.86137072, 0.86137072, 0.86604361, 0.86604361, 0.86915888,
       0.86915888, 0.87071651, 0.87071651, 0.87538941, 0.87538941,
       0.87538941, 0.87850467, 0.87850467, 0.88161994, 0.88161994,
       0.88317757, 0.88317757, 0.8847352 , 0.8847352 , 0.88629283,
       0.88629283, 0.88785047, 0.88785047, 0.8894081 , 0.8894081 ,
       0.89252336, 0.89252336, 0.89563863, 0.89875389, 0.89875389,
       0.90031153, 0.90031153, 0.90186916, 0.90186916, 0.90654206,
       0.90654206, 0.90809969, 0.90809969, 0.90965732, 0.90965732,
       0.91121495, 0.91121495, 0.91277259, 0.91277259, 0.91277259,
       0.91433022, 0.91433022, 0.91744548, 0.91744548, 0.91900312,
       0.91900312, 0.91900312, 0.91900312, 0.92056075, 0.92056075,
       0.92056075, 0.92056075, 0.92367601, 0.92367601, 0.92679128,
       0.92679128, 0.92990654, 0.92990654, 0.93146417, 0.93146417,
       0.93457944, 0.93457944, 0.93613707, 0.93613707, 0.9376947 ,
       0.9376947 , 0.93925234, 0.93925234, 0.9423676 , 0.9423676 ,
       0.94548287, 0.94548287, 0.9470405 , 0.9470405 , 0.9470405 ,
       0.95015576, 0.95015576, 0.9517134 , 0.9517134 , 0.9517134 ,
       0.9517134 , 0.95327103, 0.95327103, 0.95482866, 0.95482866,
       0.95638629, 0.95638629, 0.95950156, 0.95950156, 0.96105919,
       0.96105919, 0.96105919, 0.96261682, 0.96261682, 0.96261682,
       0.96417445, 0.96417445, 0.96573209, 0.96573209, 0.96728972,
       0.96728972, 0.96884735, 0.96884735, 0.97040498, 0.97040498,
       0.97196262, 0.97196262, 0.97352025, 0.97352025, 0.97507788,
       0.97507788, 0.97663551, 0.97663551, 0.97819315, 0.97819315,
       0.97975078, 0.97975078, 0.97975078, 0.97975078, 0.98130841,
       0.98130841, 0.98286604, 0.98286604, 0.98286604, 0.98286604,
       0.98286604, 0.98286604, 0.98286604, 0.98286604, 0.98286604,
       0.98286604, 0.98442368, 0.98442368, 0.98442368, 0.98442368,
       0.98442368, 0.98442368, 0.98442368, 0.98442368, 0.98442368,
       0.98442368, 0.98598131, 0.98598131, 0.98909657, 0.99065421,
       0.99065421, 0.99065421, 0.99065421, 0.99065421, 0.99065421,
       0.99065421, 0.99376947, 0.99376947, 0.99376947, 0.99376947,
       0.99376947, 0.9953271 , 0.9953271 , 0.99688474, 0.99688474,
       0.99688474, 0.99688474, 0.99688474, 0.99688474, 0.99688474,
       0.99844237, 0.99844237, 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        , 1.        , 1.        ,
       1.        , 1.        , 1.        ])
auc_neta = 0.9749513842881737 

In [None]:
fpr_base = np.array([0.        , 0.00111235, 0.00556174, 0.00556174, 0.00556174,
       0.00556174, 0.00556174, 0.00556174, 0.00556174, 0.00556174,
       0.00778643, 0.00778643, 0.00778643, 0.00778643, 0.00778643,
       0.00778643, 0.00778643, 0.00778643, 0.00778643, 0.00778643,
       0.00778643, 0.00778643, 0.00778643, 0.00778643, 0.00778643,
       0.00778643, 0.00778643, 0.00778643, 0.00778643, 0.00778643,
       0.00778643, 0.00778643, 0.00778643, 0.00889878, 0.00889878,
       0.00889878, 0.00889878, 0.00889878, 0.00889878, 0.01001112,
       0.01001112, 0.02113459, 0.02113459, 0.02335929, 0.02335929,
       0.02335929, 0.02447164, 0.02447164, 0.02558398, 0.02558398,
       0.02780868, 0.02780868, 0.02892102, 0.02892102, 0.03003337,
       0.03003337, 0.03337041, 0.03337041, 0.0378198 , 0.0378198 ,
       0.04004449, 0.04004449, 0.04004449, 0.04115684, 0.04115684,
       0.04449388, 0.04449388, 0.04560623, 0.04560623, 0.04783092,
       0.04783092, 0.04894327, 0.04894327, 0.05116796, 0.05116796,
       0.05228031, 0.05339266, 0.05561735, 0.05561735, 0.05561735,
       0.0567297 , 0.0567297 , 0.05784205, 0.05784205, 0.06006674,
       0.06006674, 0.06229143, 0.06229143, 0.06229143, 0.06451613,
       0.06451613, 0.06451613, 0.06562848, 0.06562848, 0.06785317,
       0.06785317, 0.06785317, 0.06896552, 0.06896552, 0.07119021,
       0.07119021, 0.07230256, 0.07230256, 0.07341491, 0.07341491,
       0.07341491, 0.07452725, 0.07452725, 0.07675195, 0.07675195,
       0.07675195, 0.07786429, 0.07786429, 0.08008899, 0.08008899,
       0.08120133, 0.08120133, 0.08231368, 0.08231368, 0.09232481,
       0.09232481, 0.0945495 , 0.0945495 , 0.09566185, 0.09566185,
       0.09788654, 0.09788654, 0.09899889, 0.09899889, 0.09899889,
       0.09899889, 0.10011123, 0.10011123, 0.10122358, 0.10122358,
       0.10233593, 0.10233593, 0.10456062, 0.10456062, 0.10456062,
       0.10456062, 0.10456062, 0.10456062, 0.10456062, 0.10456062,
       0.10456062, 0.10456062, 0.10456062, 0.10456062, 0.10456062,
       0.10456062, 0.10456062, 0.10456062, 0.10456062, 0.10456062,
       0.10456062, 0.10456062, 0.10456062, 0.10456062, 0.10456062,
       0.10456062, 0.10456062, 0.10456062, 0.10456062, 0.10456062,
       0.10456062, 0.10456062, 0.10456062, 0.10456062, 0.10456062,
       0.10456062, 0.10456062, 0.10456062, 0.10456062, 0.10456062,
       0.10456062, 0.10456062, 0.14238042, 0.14238042, 0.14794216,
       0.14794216, 0.14794216, 0.14794216, 0.14794216, 0.14794216,
       0.14794216, 0.14794216, 0.14794216, 0.14794216, 0.14794216,
       0.14794216, 0.14794216, 0.14794216, 0.14794216, 0.14794216,
       0.14794216, 0.14794216, 0.14794216, 0.14794216, 0.15239155,
       0.15239155, 0.15239155, 0.15239155, 0.15239155, 0.15239155,
       0.15350389, 0.15350389, 0.15350389, 0.16462736, 0.16462736,
       0.17463849, 0.17463849, 0.20022247, 0.20022247, 0.21802002,
       0.21802002, 0.21913237, 0.21913237, 0.22024472, 0.22024472,
       0.22135706, 0.22358176, 0.23136819, 0.23136819, 0.23359288,
       0.23359288, 0.23581758, 0.23581758, 0.23804227, 0.23804227,
       0.23915462, 0.23915462, 0.243604  , 0.243604  , 0.24471635,
       0.24471635, 0.24694105, 0.24694105, 0.25027809, 0.25027809,
       0.25139043, 0.25139043, 0.25250278, 0.25250278, 0.25361513,
       0.25361513, 0.25695217, 0.25695217, 0.26140156, 0.26140156,
       0.26362625, 0.26362625, 0.2647386 , 0.2647386 , 0.26585095,
       0.26585095, 0.26585095, 0.26585095, 0.26807564, 0.26807564,
       0.26807564, 0.27141268, 0.27141268, 0.27252503, 0.27252503,
       0.27697442, 0.27697442, 0.27919911, 0.27919911, 0.28031146,
       0.28031146, 0.28031146, 0.28698554, 0.28698554, 0.29254727,
       0.29254727, 0.29810901, 0.3003337 , 0.30144605, 0.30144605,
       0.30589544, 0.30589544, 0.30812013, 0.30812013, 0.31034483,
       0.31034483, 0.31368187, 0.31368187, 0.31813126, 0.31813126,
       0.32480534, 0.32480534, 0.33926585, 0.34149055, 0.34260289,
       0.34260289, 0.34593993, 0.34593993, 0.36151279, 0.36151279,
       0.36373749, 0.36373749, 0.37041157, 0.37263626, 0.37374861,
       0.37374861, 0.37486096, 0.37708565, 0.37931034, 0.38042269,
       0.38264739, 0.38264739, 0.38264739, 0.39154616, 0.39154616,
       0.3948832 , 0.3948832 , 0.40378198, 0.40711902, 0.40934372,
       0.40934372, 0.4137931 , 0.4137931 , 0.42380423, 0.42380423,
       0.42602892, 0.42602892, 0.42602892, 0.42714127, 0.42936596,
       0.43047831, 0.43047831, 0.43047831, 0.43159066, 0.43159066,
       0.43381535, 0.43381535, 0.43604004, 0.43604004, 0.44271413,
       0.44271413, 0.44271413, 0.44493882, 0.44493882, 0.44716352,
       0.44827586, 0.44827586, 0.44938821, 0.44938821, 0.44938821,
       0.4516129 , 0.4516129 , 0.45606229, 0.45717464, 0.45717464,
       0.45939933, 0.45939933, 0.46496107, 0.46607341, 0.46607341,
       0.46718576, 0.46718576, 0.46829811, 0.46829811, 0.4727475 ,
       0.47385984, 0.47497219, 0.47497219, 0.47942158, 0.47942158,
       0.48053393, 0.48053393, 0.48275862, 0.48275862, 0.48387097,
       0.48387097, 0.49499444, 0.49499444, 0.49833148, 0.49833148,
       0.49833148, 0.50055617, 0.50055617, 0.50611791, 0.50611791,
       0.50723026, 0.50723026, 0.52502781, 0.52502781, 0.52836485,
       0.53503893, 0.53615128, 0.53615128, 0.53837597, 0.53837597,
       0.53948832, 0.53948832, 0.54393771, 0.54393771, 0.5461624 ,
       0.55172414, 0.55728587, 0.55728587, 0.55951057, 0.55951057,
       0.56840934, 0.56840934, 0.57063404, 0.57063404, 0.57174638,
       0.57174638, 0.57285873, 0.57285873, 0.57508343, 0.57508343,
       0.58064516, 0.58064516, 0.58175751, 0.58175751, 0.58286986,
       0.58286986, 0.5839822 , 0.5839822 , 0.58954394, 0.58954394,
       0.59176863, 0.59176863, 0.59399333, 0.59399333, 0.59955506,
       0.59955506, 0.60066741, 0.60066741, 0.6028921 , 0.6028921 ,
       0.6051168 , 0.6051168 , 0.60734149, 0.60734149, 0.60734149,
       0.60845384, 0.60845384, 0.61624027, 0.61624027, 0.61735261,
       0.61735261, 0.62513904, 0.62513904, 0.63403782, 0.63403782,
       0.63515017, 0.63515017, 0.63848721, 0.63848721, 0.6407119 ,
       0.6407119 , 0.6407119 , 0.64182425, 0.64182425, 0.65183537,
       0.65183537, 0.66295884, 0.66295884, 0.67741935, 0.67741935,
       0.68520578, 0.68520578, 0.68631813, 0.68631813, 0.69299221,
       0.69299221, 0.69299221, 0.69632925, 0.69632925, 0.70077864,
       0.70077864, 0.70634038, 0.70634038, 0.7163515 , 0.7163515 ,
       0.72413793, 0.72413793, 0.72413793, 0.73414905, 0.73414905,
       0.73637375, 0.73637375, 0.77641824, 0.77641824, 0.78531702,
       0.78531702, 0.78865406, 0.78865406, 0.78865406, 0.79087875,
       0.79087875, 0.7942158 , 0.7942158 , 0.79977753, 0.79977753,
       0.81423804, 0.81423804, 0.81423804, 0.81423804, 0.81535039,
       0.81535039, 0.81757508, 0.81979978, 0.81979978, 0.82424917,
       0.82424917, 0.82647386, 0.82647386, 0.82758621, 0.8298109 ,
       0.8298109 , 0.8298109 , 0.8320356 , 0.8320356 , 0.83537264,
       0.83537264, 0.84093437, 0.84093437, 0.84204672, 0.84204672,
       0.84427141, 0.84427141, 0.84760845, 0.85650723, 0.85650723,
       0.85761958, 0.85761958, 0.85984427, 0.85984427, 0.86206897,
       0.86206897, 0.86318131, 0.87096774, 0.87096774, 0.87430478,
       0.87430478, 0.87430478, 0.88876529, 0.88876529, 0.88987764,
       0.88987764, 0.89432703, 0.89432703, 0.89988877, 0.89988877,
       0.90767519, 0.90767519, 0.91101224, 0.91101224, 0.91323693,
       0.91546162, 0.91546162, 0.91879867, 0.91879867, 0.92102336,
       0.92102336, 0.92324805, 0.92324805, 0.92547275, 0.92547275,
       0.92547275, 0.92880979, 0.92880979, 0.93103448, 0.93103448,
       0.93659622, 0.93659622, 0.93993326, 0.93993326, 0.94104561,
       0.94104561, 0.94660734, 0.94660734, 0.9621802 , 0.9621802 ,
       0.97441602, 0.97441602, 0.98331479, 0.98331479, 0.98331479,
       0.98887653, 0.98887653, 0.99888765, 0.99888765, 1.        ])
tpr_base = np.array([0.        , 0.        , 0.        , 0.00876095, 0.01126408,
       0.01376721, 0.01627034, 0.02753442, 0.03003755, 0.04130163,
       0.04130163, 0.04255319, 0.04505632, 0.05006258, 0.05256571,
       0.05381727, 0.05757196, 0.06132666, 0.06633292, 0.07008761,
       0.07259074, 0.077597  , 0.08010013, 0.08260325, 0.08510638,
       0.08886108, 0.09136421, 0.09261577, 0.09762203, 0.09887359,
       0.10137672, 0.10387985, 0.10638298, 0.10638298, 0.10763454,
       0.11013767, 0.1126408 , 0.11514393, 0.11639549, 0.11639549,
       0.11889862, 0.11889862, 0.12015019, 0.12015019, 0.12891114,
       0.13141427, 0.13141427, 0.13642053, 0.13642053, 0.14017522,
       0.14017522, 0.14768461, 0.14768461, 0.1514393 , 0.1514393 ,
       0.15269086, 0.15269086, 0.15644556, 0.15644556, 0.15894869,
       0.15894869, 0.16145181, 0.16395494, 0.16395494, 0.16770964,
       0.16770964, 0.17021277, 0.17021277, 0.17271589, 0.17271589,
       0.17647059, 0.17647059, 0.18022528, 0.18022528, 0.18272841,
       0.18272841, 0.18397997, 0.18397997, 0.1864831 , 0.19148936,
       0.19148936, 0.19274093, 0.19274093, 0.19774718, 0.19774718,
       0.20150188, 0.20150188, 0.20400501, 0.20525657, 0.20525657,
       0.20650814, 0.21151439, 0.21151439, 0.21276596, 0.21276596,
       0.21526909, 0.21652065, 0.21652065, 0.21902378, 0.21902378,
       0.22403004, 0.2252816 , 0.22653317, 0.22653317, 0.22778473,
       0.23153942, 0.23153942, 0.23779725, 0.23779725, 0.23904881,
       0.2428035 , 0.2428035 , 0.24530663, 0.24530663, 0.24906133,
       0.24906133, 0.25031289, 0.25031289, 0.25156446, 0.25156446,
       0.25281602, 0.25281602, 0.25531915, 0.25531915, 0.25657071,
       0.25657071, 0.25782228, 0.25782228, 0.25907384, 0.26157697,
       0.26282854, 0.26282854, 0.26533166, 0.26533166, 0.26658323,
       0.26658323, 0.26908636, 0.26908636, 0.27033792, 0.27284105,
       0.27659574, 0.28035044, 0.29662078, 0.2991239 , 0.32540676,
       0.32790989, 0.33416771, 0.3379224 , 0.33917397, 0.34292866,
       0.34418023, 0.34668335, 0.35043805, 0.35294118, 0.36795995,
       0.37046308, 0.37797247, 0.38047559, 0.38297872, 0.38548185,
       0.42803504, 0.43053817, 0.43929912, 0.44180225, 0.4505632 ,
       0.45306633, 0.46057572, 0.46307885, 0.47684606, 0.47934919,
       0.48310388, 0.48560701, 0.48811014, 0.49061327, 0.49436796,
       0.49687109, 0.5068836 , 0.5068836 , 0.50813517, 0.50813517,
       0.50938673, 0.51188986, 0.51689612, 0.51939925, 0.5281602 ,
       0.53316646, 0.53817272, 0.54067584, 0.54192741, 0.54443054,
       0.55944931, 0.56445557, 0.5669587 , 0.56946183, 0.57071339,
       0.57321652, 0.57697121, 0.57947434, 0.58573217, 0.58573217,
       0.59324155, 0.59699625, 0.60075094, 0.60325407, 0.60700876,
       0.60700876, 0.60951189, 0.62202753, 0.62202753, 0.6232791 ,
       0.6232791 , 0.62578223, 0.62578223, 0.62828536, 0.62828536,
       0.62953692, 0.62953692, 0.63078849, 0.63078849, 0.63204005,
       0.63204005, 0.63204005, 0.63204005, 0.63704631, 0.63704631,
       0.640801  , 0.640801  , 0.64455569, 0.64455569, 0.64580726,
       0.64580726, 0.64705882, 0.64705882, 0.64831039, 0.64831039,
       0.65206508, 0.65206508, 0.65331665, 0.65331665, 0.65581977,
       0.65581977, 0.6583229 , 0.6583229 , 0.65957447, 0.65957447,
       0.66082603, 0.66082603, 0.66332916, 0.66332916, 0.66458073,
       0.66458073, 0.66583229, 0.66583229, 0.66833542, 0.66833542,
       0.67209011, 0.67459324, 0.67709637, 0.67709637, 0.6795995 ,
       0.68085106, 0.68085106, 0.68210263, 0.68210263, 0.68585732,
       0.68585732, 0.69086358, 0.69086358, 0.69211514, 0.69211514,
       0.6971214 , 0.69962453, 0.69962453, 0.7008761 , 0.7008761 ,
       0.70337922, 0.70337922, 0.70337922, 0.70337922, 0.70588235,
       0.70588235, 0.70713392, 0.70713392, 0.70963705, 0.70963705,
       0.71088861, 0.71088861, 0.71339174, 0.71339174, 0.7146433 ,
       0.7146433 , 0.71589487, 0.71589487, 0.71589487, 0.71589487,
       0.71714643, 0.71714643, 0.718398  , 0.718398  , 0.71964956,
       0.71964956, 0.72090113, 0.72090113, 0.72090113, 0.72090113,
       0.72215269, 0.72215269, 0.72215269, 0.72215269, 0.72340426,
       0.72340426, 0.72465582, 0.72715895, 0.72715895, 0.72966208,
       0.72966208, 0.73216521, 0.73216521, 0.73216521, 0.73216521,
       0.73341677, 0.73341677, 0.7359199 , 0.7359199 , 0.73717146,
       0.73717146, 0.73842303, 0.74092616, 0.74092616, 0.74092616,
       0.74092616, 0.74342929, 0.74468085, 0.74468085, 0.74593242,
       0.74593242, 0.74718398, 0.74718398, 0.74843554, 0.74843554,
       0.75093867, 0.75219024, 0.75219024, 0.75469337, 0.75594493,
       0.75594493, 0.75844806, 0.75844806, 0.76095119, 0.76220275,
       0.76220275, 0.76470588, 0.76470588, 0.76470588, 0.76846058,
       0.76846058, 0.7709637 , 0.7709637 , 0.77221527, 0.7747184 ,
       0.7747184 , 0.77722153, 0.77722153, 0.77847309, 0.77847309,
       0.77972466, 0.77972466, 0.78222778, 0.78222778, 0.78347935,
       0.78347935, 0.78598248, 0.78598248, 0.78723404, 0.78723404,
       0.78973717, 0.78973717, 0.79098874, 0.79098874, 0.79599499,
       0.79724656, 0.79724656, 0.79849812, 0.79849812, 0.79974969,
       0.79974969, 0.80100125, 0.80100125, 0.80475594, 0.80475594,
       0.80475594, 0.80475594, 0.80600751, 0.80600751, 0.80725907,
       0.80725907, 0.8097622 , 0.8097622 , 0.81101377, 0.81101377,
       0.81101377, 0.81101377, 0.81226533, 0.81226533, 0.8135169 ,
       0.8135169 , 0.81602003, 0.81602003, 0.81727159, 0.81727159,
       0.81977472, 0.81977472, 0.82227785, 0.82227785, 0.82352941,
       0.82352941, 0.82478098, 0.82478098, 0.82603254, 0.82603254,
       0.82728411, 0.82728411, 0.82978723, 0.82978723, 0.8310388 ,
       0.8310388 , 0.83229036, 0.83229036, 0.83354193, 0.83354193,
       0.83479349, 0.83479349, 0.83604506, 0.83604506, 0.83729662,
       0.83729662, 0.83854819, 0.83854819, 0.83979975, 0.84230288,
       0.84230288, 0.84355444, 0.84355444, 0.84480601, 0.84480601,
       0.84730914, 0.84730914, 0.84981227, 0.84981227, 0.85106383,
       0.85106383, 0.85231539, 0.85231539, 0.85356696, 0.85356696,
       0.85481852, 0.85732165, 0.85732165, 0.85982478, 0.85982478,
       0.86232791, 0.86232791, 0.86483104, 0.86483104, 0.8660826 ,
       0.8660826 , 0.86858573, 0.86858573, 0.8698373 , 0.8698373 ,
       0.87234043, 0.87359199, 0.87359199, 0.87484355, 0.87484355,
       0.87609512, 0.87609512, 0.87859825, 0.87859825, 0.87984981,
       0.87984981, 0.88235294, 0.89236546, 0.89236546, 0.89486859,
       0.89486859, 0.89612015, 0.89612015, 0.89862328, 0.89862328,
       0.89987484, 0.89987484, 0.90237797, 0.90362954, 0.90362954,
       0.9048811 , 0.9048811 , 0.90613267, 0.90613267, 0.90863579,
       0.90863579, 0.90988736, 0.91239049, 0.91364205, 0.91364205,
       0.91489362, 0.91489362, 0.91489362, 0.91614518, 0.91614518,
       0.91864831, 0.91864831, 0.91989987, 0.91989987, 0.91989987,
       0.922403  , 0.92365457, 0.92365457, 0.92490613, 0.92490613,
       0.92740926, 0.92740926, 0.92866083, 0.92866083, 0.93491865,
       0.93491865, 0.93742178, 0.93742178, 0.93742178, 0.93867334,
       0.93867334, 0.94117647, 0.94117647, 0.94242804, 0.94242804,
       0.94618273, 0.94743429, 0.94743429, 0.94868586, 0.94868586,
       0.95118899, 0.95244055, 0.95244055, 0.95369212, 0.95369212,
       0.95619524, 0.95619524, 0.95744681, 0.95744681, 0.9612015 ,
       0.9612015 , 0.96245307, 0.96245307, 0.9649562 , 0.9649562 ,
       0.9649562 , 0.96620776, 0.96620776, 0.96871089, 0.96871089,
       0.96996245, 0.96996245, 0.97246558, 0.97246558, 0.97496871,
       0.97747184, 0.97747184, 0.97997497, 0.97997497, 0.9824781 ,
       0.9824781 , 0.98372966, 0.98372966, 0.98498123, 0.98498123,
       0.98623279, 0.98623279, 0.98873592, 0.98873592, 0.99123905,
       0.99123905, 0.99249061, 0.99249061, 0.99499374, 0.99624531,
       0.99624531, 0.99749687, 0.99749687, 1.        , 1.        ])
auc_base = 0.7011614907956414

In [None]:
# Compare the ROC curves of all models on a single set of axes.
# (matplotlib.pyplot is already imported as `plt` in the notebook's first cell.)

# The default figure size has to be set BEFORE the figure is created: changing
# it after the plot calls leaves the already-created figure at the old size,
# so the first run of this cell would not be 10x10.
# This is a global setting, so figures created later keep using it as well.
plt.rcParams["figure.figsize"] = (10, 10)

roc_fig, roc_ax = plt.subplots()

# One entry per model: (false-positive rates, true-positive rates, legend label,
# extra line style). Curves without an explicit colour take the next colour of
# the default colour cycle, so the order of this list determines those colours.
roc_curves = [
    (fpr_neta, tpr_neta, "AUC_XGBOOST={:.3f}".format(auc_neta), {}),
    (fpr_base, tpr_base, "AUC_Logistic_Regression={:.3f}".format(auc_base), {"color": "brown"}),
    (fpr, tpr, "AUC_RandomForest={:.3f}".format(auc), {"color": "red"}),
    (fpr_yaniv, tpr_yaniv, "AUC_DL={:.3f}".format(auc_yaniv), {}),
    (fpr_nofar, tpr_nofar, "AUC_Stacking={:.3f}".format(auc_nofar), {"color": "green"}),
]
for curve_fpr, curve_tpr, curve_label, curve_style in roc_curves:
    roc_ax.plot(curve_fpr, curve_tpr, label=curve_label, **curve_style)

# Dashed diagonal from (0, 0) to (1, 1) as a visual reference; it has no label
# and therefore does not appear in the legend.
roc_ax.plot([0, 1], [0, 1], color='orange', linestyle='--')

# Ticks every 0.1 on both axes. linspace gives exactly 11 ticks ending at 1.0,
# without relying on a float step landing on the end point.
roc_ticks = np.linspace(0.0, 1.0, 11)
roc_ax.set_xticks(roc_ticks)
roc_ax.set_xlabel("False Positive Rate", fontsize=15)
roc_ax.set_yticks(roc_ticks)
roc_ax.set_ylabel("True Positive Rate", fontsize=15)
roc_ax.set_title('ROC Curve Analysis', fontweight='bold', fontsize=15)
roc_ax.legend(prop={'size': 13}, loc='lower right')

plt.show()

In [None]:
# Rank the features of the fitted model `clf` by importance, print the ranking
# and draw it as a bar chart with error bars.
importances = clf.feature_importances_
# Feature positions ordered from most to least important.
indices = np.argsort(importances)[::-1]

# Standard deviation of each feature's importance across the individual
# estimators in clf.estimators_; drawn as the error bars below.
std = np.std([tree.feature_importances_ for tree in clf.estimators_],
             axis=0)

# Take the number of features from the importances vector itself instead of
# the global `X`, so this cell stays correct even if `X` was rebound elsewhere.
num_features = len(importances)

# Print the feature ranking
print("Feature ranking:")

# NOTE(review): names are looked up in train_df.columns, which assumes those
# columns are in the same order as the features clf was fitted on - confirm.
for rank, feature_idx in enumerate(indices, start=1):
    print(f"{rank}. feature: {train_df.columns[feature_idx]} ({importances[feature_idx]})")

# Plot the impurity-based feature importances of the forest
bar_positions = list(range(num_features))
importance_fig, importance_ax = plt.subplots()
importance_ax.set_title("Feature importances")
importance_ax.bar(bar_positions, importances[indices],
                  color="r", yerr=std[indices], align="center")
# Each bar is labelled with the position of its feature in the importances vector.
importance_ax.set_xticks(bar_positions)
importance_ax.set_xticklabels(indices)
importance_ax.set_xlim([-1, num_features])
importance_ax.set_xlabel("Feature index")
importance_ax.set_ylabel("Importance")
plt.show()