<a href="https://colab.research.google.com/github/niroshank/asp-dotnet-mvc-l10n-app/blob/master/BOW_Coronavirus.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [108]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from google.colab import files
uploaded=files.upload()

Saving coronavirus_question_data.csv to coronavirus_question_data.csv


In [129]:
# Create a dataframe
df = pd.read_csv('coronavirus_question_data.csv',encoding='cp1252')
df.head(5)

Unnamed: 0,observed Date,question
0,4/14/2020,What are the Symptoms of Covid-19?
1,4/14/2020,How does Covid-19 spread?
2,4/14/2020,Can the virus that causes covid-19 be transmit...
3,4/14/2020,Can covid-19 be caught from a person who has n...
4,4/14/2020,What can we do to protect ourselves?


**Identify the data patterns**

In [130]:
# Number of words
df['word_count'] = df['question'].apply(lambda x: len(str(x).split(" ")))
df[['question','word_count']].head()

Unnamed: 0,question,word_count
0,What are the Symptoms of Covid-19?,6
1,How does Covid-19 spread?,4
2,Can the virus that causes covid-19 be transmit...,10
3,Can covid-19 be caught from a person who has n...,11
4,What can we do to protect ourselves?,7


In [131]:
len(df)

63

In [132]:
# Number of characters including spaces
df['char_count'] = df['question'].str.len()
df[['question','char_count']].head()

Unnamed: 0,question,char_count
0,What are the Symptoms of Covid-19?,34
1,How does Covid-19 spread?,25
2,Can the virus that causes covid-19 be transmit...,62
3,Can covid-19 be caught from a person who has n...,57
4,What can we do to protect ourselves?,36


In [133]:
# Average word length
def avg_word(sentence):
  words = sentence.split()
  return (sum(len(word) for word in words)/len(words))

df['avg_word'] = df['question'].apply(lambda x: avg_word(x))
df[['question','avg_word']].head()

Unnamed: 0,question,avg_word
0,What are the Symptoms of Covid-19?,4.833333
1,How does Covid-19 spread?,5.5
2,Can the virus that causes covid-19 be transmit...,5.3
3,Can covid-19 be caught from a person who has n...,4.272727
4,What can we do to protect ourselves?,4.285714


In [134]:
# Identify common words
freq = pd.Series(' '.join(df['question']).split()).value_counts()[:20]
freq

What            35
the             31
of              30
are             24
is              12
to              10
in              10
How              9
many             8
a                8
I                8
virus            7
number           7
measures         6
preventive       6
COVID19?         6
for              5
coronavirus?     5
corona           5
symptoms         5
dtype: int64

In [135]:
# Identify the rare words
freq = pd.Series(' '.join(df['question']).split()).value_counts()[-20:]
freq

us             1
colombo?       1
test           1
flourishing    1
make           1
health         1
nearest        1
today?         1
Updates        1
become         1
fatal          1
brew           1
human          1
spread?        1
LANKA          1
late           1
an             1
WE             1
into           1
cause          1
dtype: int64

In [150]:
# import nltk
# nltk.download('punkt')
# TF = (Number of times term T appears in the particular row) / (number of terms in that row)
tf1 = (df['question'][1:len(df)]).apply(lambda x: pd.value_counts(x.split(" "))).sum(axis = 0).reset_index()
tf1.columns = ['words','tf']

Unnamed: 0,words,tf
26,what,34.0
33,of,26.0
7,the,24.0
30,are,19.0
1,covid19,12.0
50,is,12.0
0,how,9.0
34,in,7.0
35,many,7.0
99,number,7.0


**Data preprocessing**

In [137]:
# Convert all the words into lowercase
df['question'] = df['question'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df['question'].head()

0                   what are the symptoms of covid-19?
1                            how does covid-19 spread?
2    can the virus that causes covid-19 be transmit...
3    can covid-19 be caught from a person who has n...
4                 what can we do to protect ourselves?
Name: question, dtype: object

In [138]:
# Remove punctuation
df['question'] = df['question'].str.replace('[^\w\s]','')
df['question'].head()

0                     what are the symptoms of covid19
1                              how does covid19 spread
2    can the virus that causes covid19 be transmitt...
3    can covid19 be caught from a person who has no...
4                  what can we do to protect ourselves
Name: question, dtype: object

In [0]:
Possible_Question_Key_Words = ["whats","what","where","when","why","isn't","whats","who","should","would","could","can","do","does","can","can","did","how","how many"]

In [139]:
from nltk.corpus import stopwords
stop = stopwords.words('english')
df['is_question'] = df['question'].apply(lambda x: len([x for x in x.split() if x in Possible_Question_Key_Words]))
df[['question','is_question']].head()

Unnamed: 0,question,is_question
0,what are the symptoms of covid19,1
1,how does covid19 spread,2
2,can the virus that causes covid19 be transmitt...,1
3,can covid19 be caught from a person who has no...,2
4,what can we do to protect ourselves,3


In [141]:
# Remove non question texts
 df = df.drop(df[df.is_question == 0].index)
 df.head()

Unnamed: 0,observed Date,question,word_count,char_count,avg_word,is_question
0,4/14/2020,what are the symptoms of covid19,6,34,4.833333,1
1,4/14/2020,how does covid19 spread,4,25,5.5,2
2,4/14/2020,can the virus that causes covid19 be transmitt...,10,62,5.3,1
3,4/14/2020,can covid19 be caught from a person who has no...,11,57,4.272727,2
4,4/14/2020,what can we do to protect ourselves,7,36,4.285714,3


In [0]:
# identifying combination
from collections import Counter
from itertools import chain

def find_ngrams(input_list, n):
    return list(zip(*[input_list[i:] for i in range(n)]))

In [143]:
df['question'][0]

'what are the symptoms of covid19'

In [144]:
# get word combination list
df['bigrams'] = df['question'].map(lambda x: find_ngrams(x.split(" "), 2))
df['bigrams'][0]

[('what', 'are'),
 ('are', 'the'),
 ('the', 'symptoms'),
 ('symptoms', 'of'),
 ('of', 'covid19')]

In [0]:
# Bigram Frequency Counts
bigrams = df['bigrams'].tolist()
bigrams = list(chain(*bigrams))
bigrams = [(x.lower(), y.lower()) for x,y in bigrams]

In [146]:
# get most common 10
bigram_counts = Counter(bigrams)
bigram_counts.most_common(20)

[(('what', 'are'), 18),
 (('are', 'the'), 18),
 (('what', 'is'), 11),
 (('of', 'covid19'), 10),
 (('the', 'symptoms'), 7),
 (('how', 'many'), 7),
 (('number', 'of'), 7),
 (('symptoms', 'of'), 6),
 (('the', 'preventive'), 6),
 (('preventive', 'measures'), 6),
 (('measures', 'of'), 6),
 (('of', 'corona'), 6),
 (('is', 'the'), 5),
 (('the', 'dispose'), 4),
 (('dispose', 'methods'), 4),
 (('methods', 'of'), 4),
 (('corona', 'virus'), 4),
 (('virus', 'patient'), 4),
 (('the', 'contact'), 4),
 (('contact', 'number'), 4)]

In [151]:
# get most frequnt words
tf1.sort_values('tf',ascending=False)[:10]

Unnamed: 0,words,tf
26,what,34.0
33,of,26.0
7,the,24.0
30,are,19.0
1,covid19,12.0
50,is,12.0
0,how,9.0
34,in,7.0
35,many,7.0
99,number,7.0
