**Import library**

In [30]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('maxent_ne_chunker')
nltk.download('words')
nltk.download('stopwords')

from nltk.corpus import stopwords
import string
import json
from nltk.sentiment.util import *
from textblob import TextBlob

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lohit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lohit\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\lohit\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\lohit\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lohit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [7]:
#Import and Read the data

with open('tweets.json') as jfile:
    json_file = json.load(jfile)

**Convert into Data Frame that easy to readable**

In [8]:
df=pd.DataFrame(json_file).T

In [9]:
df.head(10)

Unnamed: 0,tweet_author,tweet_text
1374140386071961602,Hematopoiesis News,⚕️ Scientists conducted a Phase II study of ac...
1374032432173842437,"Michael Wang, MD",This phase 2 Acalabrutinib-Venetoclax (AV) tri...
1373902876553048065,1stOncology,#NICE backs #AstraZenecas #Calquence for #CLL ...
1373656782367813635,Toby Eyre,#acalabrutinib is a valuable option in pts int...
1372941634334232586,Lymphoma Hub,NICE has recommended the use of acalabrutinib ...
1372927482278539265,David Ledger,NICE backs AstraZeneca’s Calquence for CLL htt...
1372911723305394179,N Wales Cancer Forum,This is England for now - these decisions usua...
1372888121159868423,European Pharmaceutical Review,"AstraZeneca’s Calquence (acalabrutinib), a che..."
1372866915081797632,Graham Collins,Superstar @tobyeyre82 responding to the excell...
1372825553837944834,CLL Ireland,CLL patients all know the drug Ibrutinib and y...


**check null values**

**Total no of unique writer**

In [10]:
print('total no of null valus in the data:\n',df.isnull().sum())
print('total no tweet author:',df.tweet_author.nunique())

total no of null valus in the data:
 tweet_author    0
tweet_text      0
dtype: int64
total no tweet author: 9292


**Clean the data and convert them into Machine Readable Code**

In [12]:
stopword = stopwords.words('english')
def clean_data(data):
    data = data.lower()
    data = re.sub(r'[^(a-zA-Z)\s]','', data)
     #     remove urls
    data = re.sub(r'http\S+', " ", data)
    #     remove mentions
    data = re.sub(r'@\w+',' ',data)
    #     remove hastags
    data = re.sub(r'#\w+', ' ',data)
    #     remove digits
    data = re.sub(r'\d+()', ' ', data)
    #     remove html tags and umber
    data = re.sub('r<.*?>',' ', data)
    #     remove stop words 
    data = data.split()
    data = " ".join([word for word in data if not word in stopword])
    return data
df['tweet_text']=df['tweet_text'].apply(lambda x:clean_data(x))
def remove_punct(text):
    text  = "".join([char for char in text if char not in string.punctuation])
    text = re.sub('[0-9]+', '', text)
    return text

df['tweet_text']= df['tweet_text'].apply(lambda x: remove_punct(x))
#Tokenization of the text data
def tokenization(text):
    text = re.split('\W+', text)
    return text
df['tweet_text'] = df['tweet_text'].apply(lambda x: tokenization(x.lower()))



**Get the most frequent entities from the tweets. and we convert them into csv**

In [13]:
df1=df['tweet_text']

**Steaming anf Lemitization of the code**

In [14]:
ps = nltk.PorterStemmer()
def stemming(text):
    text = [ps.stem(word) for word in text]
    return text
df1 = df1.apply(lambda x: stemming(x))

In [15]:
wn = nltk.WordNetLemmatizer()
def lemmatizer(text):
    text = [wn.lemmatize(word) for word in text]
    return text
df1= df1.apply(lambda x: lemmatizer(x))

**Stanford NLP NER**

In [19]:
from nltk.tag.stanford import StanfordNERTagger

nltk.download('punkt')

st = StanfordNERTagger('stanford-ner-2018-10-16/classifiers/english.all.3class.distsim.crf.ser.gz',
                       'stanford-ner-2018-10-16/stanford-ner.jar',
                       encoding='utf-8')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lohit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
val = [item if isinstance(df1, str) else " ".join(item) for item in df1 ]
seen = set()
val = [x for x in val if x not in seen and not seen.add(x)]

In [21]:
classified_text = st.tag(val)

In [24]:
entity= pd.DataFrame(classified_text,columns=['Entity Name','Entity Type'])

#Here We Remove Entities_type column from data we does not requierd this but the help to recgonized the type of entity

all_entities = (entity.groupby(by=['Entity Name'])
                          .size()
                          .sort_values(ascending=False)
                          .reset_index().rename(columns={0 : 'Frequency'}))
all_entities.head(10)

Unnamed: 0,Entity Name,Frequency
0,acalabrutinib,1306
1,calquenc,893
2,patient,790
3,covid,694
4,astrazeneca,598
5,cll,562
6,trial,425
7,lymphocyt,388
8,chronic,351
9,leukemia,342


In [25]:
all_entities.to_csv('entity_csv')

**Find out the sentiment/polarity of each author towards each of the entities**

In [31]:
df['tweet_text'] = df['tweet_text'].astype('str')
def get_polarity(text):
    return TextBlob(text).sentiment.polarity
df['Polarity'] = df['tweet_text'].apply(get_polarity)

In [32]:
df.to_csv('objective2')

In [28]:
df.head(10)

Unnamed: 0,tweet_author,tweet_text
1374140386071961602,Hematopoiesis News,"['scientists', 'conducted', 'phase', 'ii', 'st..."
1374032432173842437,"Michael Wang, MD","['phase', 'acalabrutinibvenetoclax', 'av', 'tr..."
1373902876553048065,1stOncology,"['nice', 'backs', 'astrazenecas', 'calquence',..."
1373656782367813635,Toby Eyre,"['acalabrutinib', 'valuable', 'option', 'pts',..."
1372941634334232586,Lymphoma Hub,"['nice', 'recommended', 'use', 'acalabrutinib'..."
1372927482278539265,David Ledger,"['nice', 'backs', 'astrazenecas', 'calquence',..."
1372911723305394179,N Wales Cancer Forum,"['england', 'decisions', 'usually', 'come', 'w..."
1372888121159868423,European Pharmaceutical Review,"['astrazenecas', 'calquence', 'acalabrutinib',..."
1372866915081797632,Graham Collins,"['superstar', 'tobyeyre', 'responding', 'excel..."
1372825553837944834,CLL Ireland,"['cll', 'patients', 'know', 'drug', 'ibrutinib..."


**Thank you**