In [None]:
# Load EDA Pkgs
import pandas as pd
import numpy as np


In [None]:
# Load Data Viz
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
# Load Sentiment Pkgs
from textblob import TextBlob


In [None]:
# Load Dataset
df = pd.read_csv("drugsCom_raw/drugsComTrain_raw.tsv",sep='\t')

In [None]:
# Preview Dataset
df.head()

In [None]:
# Columns
df.columns

In [None]:
# Missing Values
df.isnull().sum()

In [None]:
# How many drugs do we have?
df['drugName'].unique().tolist()

In [None]:
# How many drugs do we have?
len(df['drugName'].unique().tolist())

In [None]:
# What is the most popular drug?
df['drugName'].value_counts()

In [None]:
# What is the most popular drug?
# Top 20 Drugs (Most Popular)
df['drugName'].value_counts().nlargest(20)

In [None]:
# Top 20 Drugs (Most Popular)
plt.figure(figsize=(20,10))
df['drugName'].value_counts().nlargest(20).plot(kind='bar')
plt.title("Top 20 Most popular drugs based on counts")
plt.show()

In [None]:
# Least 20 Drugs (Most Popular)
df['drugName'].value_counts().nsmallest(20)

In [None]:
df['drugName'].value_counts().nsmallest(20).plot(kind='bar')

In [None]:
# Lookup table: lower-case name fragment -> drug class label.
# classify_drug() matches these against the END of a drug name.
# Labels that describe the same class are spelled identically so they are
# counted as one group downstream.
drug_suffix = {"azole":"antifungal (except metronidazole)",
"caine":"anesthetic",
"cillin":"antibiotic(penicillins)",
"mycin":"antibiotic",
"micin":"antibiotic",
"cycline":"antibiotic",
"oxacin":"antibiotic",
# NOTE(review): "ceph"/"cef" normally start a name (cephalexin, cefazolin),
# so an endswith-based matcher will not hit them - confirm intent.
"ceph":"antibiotic (cephalosporins)",
"cef":"antibiotic (cephalosporins)",
"dine":"h2 blockers (anti-ulcers)",
"done":"opioid analgesics",
"ide":"oral hypoglycemics",
"lam":"anti-anxiety",
"pam":"anti-anxiety",
"mide":"diuretics",
"zide":"diuretics",
"nium":"neuromuscular blocking agents",
"olol":"beta blockers",
"tidine":"h2 antagonist",
"tropin":"pituitary hormone",
"zosin":"alpha blocker",
"ase":"thrombolytics",
"plase":"thrombolytics",
"azepam":"anti-anxiety (benzodiazepine)",
"azine":"antipsychotics (phenothiazine)",
"barbital":"barbiturate",
"dipine":"calcium channel blocker",
# Same label as "olol" so that e.g. labetalol and atenolol land in one class
"lol":"beta blockers",
"zolam":"cns depressants",
"pril":"ace inhibitor",
"artan":"arb blocker",
# Drug names end in "statin" (atorvastatin, simvastatin); the plural key never matched
"statin":"lipid-lowering drugs",
"statins":"lipid-lowering drugs",  # kept for backward compatibility
"parin":"anticoagulants",
"sone":"corticosteroid (prednisone)"}


In [None]:
def classify_drug(drugname):
    for i in drug_suffix.keys():
        if drugname.endswith(i):
            print(True)
            print(drug_suffix[i])

In [None]:
classify_drug('Valsartan')

In [None]:
classify_drug('losartan')

In [None]:
def classify_drug(drugname, suffix_map=None):
    """Return the drug class whose suffix matches the end of ``drugname``.

    When several suffixes match (e.g. "ide" and "mide" for "furosemide"),
    the longest, i.e. most specific, one wins instead of whichever happens
    to come first in the table.

    Parameters
    ----------
    drugname : str
        Drug name to classify. Matching ignores case and surrounding
        whitespace. Non-string values (NaN, None) are treated as unknown.
    suffix_map : dict, optional
        Mapping of lower-case suffix -> class label. Defaults to the
        notebook-level ``drug_suffix`` table.

    Returns
    -------
    str or None
        The class label, or None when nothing matches.
    """
    if suffix_map is None:
        suffix_map = drug_suffix
    if not isinstance(drugname, str):
        # Missing values reach here as float NaN / None when used with .apply()
        return None
    name = drugname.strip().lower()
    matches = [suffix for suffix in suffix_map if name.endswith(suffix)]
    if not matches:
        return None
    return suffix_map[max(matches, key=len)]

In [None]:
classify_drug('valsartan')

In [None]:
df['drug_class'] = df['drugName'].apply(classify_drug)

In [None]:
df[['drugName','drug_class']]

In [None]:
# How many Groups of Drugs By Class
df['drug_class'].unique().tolist()

In [None]:
# How many Groups of Drugs By Class
len(df['drug_class'].unique().tolist())

In [None]:
# Which of class of drug  is the most commonest
df['drug_class'].value_counts()

In [None]:
# Which of class of drug  is the most commonest
plt.figure(figsize=(20,10))
df['drug_class'].value_counts().plot(kind='bar')
plt.title("Distribution of Drugs By Class")
plt.show()


In [None]:
# Distribution of Drugs Per Drug Group based on size
drug_groups = df.groupby('drug_class').size()

In [None]:
type(drug_groups)

In [None]:
# Convert to DF
# Method 1
drug_groups.to_frame()

In [None]:
# Convert to DF
# Method 2
drug_groups_df = pd.DataFrame({'drug_class':drug_groups.index,'counts':drug_groups.values})

In [None]:
# Seaborn Plot
plt.figure(figsize=(20,10))
g = sns.barplot(data=drug_groups_df,x='drug_class',y='counts')
plt.show()

In [None]:
# Seaborn Plot
plt.figure(figsize=(20,10))
g = sns.barplot(data=drug_groups_df,x='drug_class',y='counts')
g.set_xticklabels(drug_groups_df['drug_class'].values,rotation=30)
plt.show()

In [None]:
# Seaborn Plot
plt.figure(figsize=(20,10))
g = sns.barplot(data=drug_groups_df,x='drug_class',y='counts')
plt.xticks(rotation=30)
plt.show()

In [None]:
# Number of Conditions
df['condition'].unique()

In [None]:
len(df['condition'].unique().tolist())

In [None]:
#### Distribution of Conditions
df['condition'].value_counts()

In [None]:
#### Most commonest Conditions
df['condition'].value_counts().nlargest(20)

In [None]:
#### Most commonest Conditions
df['condition'].value_counts().nlargest(20).plot(kind='bar',figsize=(20,10))

In [None]:
df['condition'].value_counts().nsmallest(20)

In [None]:
#### Least commonest Conditions
df['condition'].value_counts().nsmallest(20).plot(kind='bar',figsize=(20,10))

In [None]:
# How many Drugs per condition (Top 20)
df.groupby('condition')['drugName'].nunique().nlargest(20)

In [None]:
# How many Drugs per condition (Top 20)
plt.figure(figsize=(15,10))
df.groupby('condition')['drugName'].nunique().nlargest(20).plot(kind='bar')
plt.title("Number of Drugs Per Condition")
plt.grid()
plt.show()

In [None]:
df['rating']

In [None]:
# Distrubtion of Rating By Size
df.groupby('rating').size()

In [None]:
# Distrubtion of Rating By Size
df.groupby('rating').size().plot(kind='bar')

In [None]:
# # Distrubtion of Rating By Size Using Histogram
plt.figure(figsize=(20,10))
df['rating'].hist()
plt.title("Distrubtion of Rating By Size Using Histogram")
plt.show()

In [None]:
# Average Rating of Drugs
avg_rating = (df['rating'].groupby(df['drugName']).mean())

In [None]:
avg_rating

In [None]:
# Average Rating For All Drugs
plt.figure(figsize=(20,10))
avg_rating.hist()
plt.title("Distrubtion of Average Rating For All Drugs")
plt.show()

In [None]:
# Average Rating of Drugs By Class
avg_rating_per_drug_class = (df['rating'].groupby(df['drug_class']).mean())

In [None]:
avg_rating_per_drug_class

In [None]:
# Average Rating For All Drugs
plt.figure(figsize=(20,10))
avg_rating_per_drug_class.hist()
plt.title("Distrubtion of Average Rating For Drug Classes")
plt.show()


In [None]:
# Which Group of Drugs have the higest mean/average rating
avg_rating_per_drug_class.nlargest(20)

In [None]:
# Which Drugs have the higest mean/average rating
avg_rating.nlargest(20)

In [None]:
# Column names, as a reminder of what is available
df.columns

In [None]:
# How genuine is the review? (Using sentiment analysis)
# NOTE: TextBlob is already imported in the "Load Sentiment Pkgs" cell near the top;
# this repeat is redundant on a top-to-bottom run.
from textblob import TextBlob

In [None]:
# The free-text reviews that get scored below
df['review']

In [None]:
def get_sentiment(text):
    """Return the TextBlob polarity score of ``text``."""
    return TextBlob(text).polarity

def get_sentiment_label(text):
    """Return 'positive', 'negative' or 'neutral' depending on the sign of
    the TextBlob polarity of ``text``."""
    score = TextBlob(text).polarity
    if score > 0:
        return 'positive'
    if score < 0:
        return 'negative'
    # Neither above nor below zero
    return 'neutral'

In [None]:
# text fxn
get_sentiment("I love apples")

In [None]:
# text fxn
get_sentiment_label("I love apples")

In [None]:
# Sentiment Score for Review
df['sentiment'] = df['review'].apply(get_sentiment)

In [None]:
# Sentiment Labels for Review
# The label is just the sign of the polarity already stored in df['sentiment'],
# so derive it from that column rather than running TextBlob over every review again
df['sentiment_label'] = df['sentiment'].apply(
    lambda score: 'positive' if score > 0 else ('negative' if score < 0 else 'neutral')
)

In [None]:
df[['review','sentiment','sentiment_label']]

In [None]:
# How many positive and negative and neutral reviews?
df['sentiment_label'].value_counts()

In [None]:
# How many positive and negative and neutral reviews?
df['sentiment_label'].value_counts().plot(kind='bar')

In [None]:
#### Correlation Between Our sentiment and rating
sns.lineplot(data=df,x='rating',y='sentiment')
plt.show()

In [None]:
# Correlation  btween rating and sentiment
sns.lineplot(data=df,x='rating',y='sentiment',hue='sentiment_label')

In [None]:
# Genuine Good  Rating Per Review
good_review =  df[(df['rating'] >= 6) & (df['sentiment_label'] == 'positive')]

In [None]:
# Genuine Bad  Rating Per Review
bad_review = df[(df['rating'] <= 4) & (df['sentiment_label'] == 'negative')]

In [None]:
good_review.head()

In [None]:
good_review.iloc[0]['review']

In [None]:
df.groupby('drugName')['usefulCount'].value_counts()

In [None]:
# Top Drugs Per UsefulCount
df.groupby('drugName')['usefulCount'].nunique().nlargest(20)

In [None]:
# Top Drugs Per UsefulCount
df.groupby('drugName')['usefulCount'].nunique().nlargest(20).plot(kind='bar')

In [None]:
# Top Drugs Class Per UsefulCount
df.groupby('drug_class')['usefulCount'].nunique().nlargest(20)

In [None]:
# Top Drugs Class Per UsefulCount
df.groupby('drug_class')['usefulCount'].nunique().nlargest(20).plot(kind='bar')
plt.title("Top Drug Class Per Usefulcount")
plt.show()

In [None]:
# Top Drugs Class Per UsefulCount
df.groupby('drug_class')['usefulCount'].nunique().nsmallest(20).plot(kind='bar')
plt.title("Least Drug Class Per Usefulcount")
plt.show()

In [None]:
### Correlation between Rating and Usefulcount
sns.lineplot(data=df,x='rating',y='usefulCount')

In [None]:
#### Question on Date
df.columns

In [None]:
# Rating Per Year
df.groupby('date')['rating'].size()

In [None]:
# Averaging Rating Per Day of A Year
df.groupby('date')['rating'].mean()

In [None]:
# Average Rating Per Day of Every Year
df.groupby('date')['rating'].mean().plot(figsize=(20,10))
plt.title("Average Rating Per Day of Every Year")
plt.show()


In [None]:
# Average Useful Per Day of Every Year
df.groupby('date')['usefulCount'].mean().plot(figsize=(20,10))
plt.title("Average UsefulCount Per Day of Every Year")
plt.show()

In [None]:
# Average Sentiment Per Day of Every Year
df.groupby('date')['sentiment'].mean().plot(figsize=(20,10))
plt.title("Average sentiment Per Day of Every Year")
plt.show()


In [None]:
# Amount of Review Per Day of Every Year
df.groupby('date')['review'].size().plot(figsize=(20,10))
plt.title("Amount of Review Per Day of Every Year")
plt.show()

In [None]:
# Amount of Review Per Day of Every Year (bar chart)
# Group on parsed calendar dates so the bars run in chronological order
# instead of the alphabetical order of the raw date strings
review_days = pd.to_datetime(df['date']).dt.date
df.groupby(review_days)['review'].size().plot(kind='bar',figsize=(20,10))
plt.title("Amount of Review Per Day of Every Year")
plt.show()

In [None]:
####  Using DatetimeIndex
grouped_date = df.groupby('date').agg({'rating':np.mean,'usefulCount':np.sum,'review':np.size})

In [None]:
grouped_date

In [None]:
grouped_date.index

In [None]:
grouped_date['date'] = grouped_date.index

In [None]:
grouped_date['date'] = pd.DatetimeIndex(grouped_date['date'])

In [None]:
grouped_date.dtypes

In [None]:
grouped_date = grouped_date.set_index('date')

In [None]:
# Select A Particular Date Range
grouped_date['2008'].plot()

In [None]:
# AMount of Review Fr 2008
grouped_date['2008']['review'].plot()
plt.title("Amount of Review For 2008")
plt.show()

In [None]:
# AMount of Review Fr 2008
grouped_date['2008':'2009']['review'].plot()
plt.title("Amount of Review For 2008-2009")
plt.show()

In [None]:
# Distribution of Rating Over Time
grouped_date['2008':'2009']['rating'].plot()
plt.title("Distribution of Rating Over Time")
plt.show()

In [None]:
# Distribution of Rating Over Time
grouped_date['2008':'2012']['rating'].plot(figsize=(20,10))
plt.title("Distribution of Rating Over Time")
plt.show()

In [None]:
grouped_date['2008-04'].plot()

In [None]:
# Distribution of Rating Over A Month
grouped_date['2008-4':'2008-5']['rating'].plot()
plt.title("Distribution of Rating Over Time")
plt.show()

In [None]:
# Save Dataset
# Write the frame, including the columns added in this notebook, without the row index
df.to_csv(
    "drug_review_dataset_with_sentiment.csv",
    index=False,
)