#### Import Modules

In [1]:
import sys, os
import re
import pandas as pd
import cohere
from dotenv import load_dotenv
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
sys.path.append(os.path.abspath(os.path.join('../scripts')))
from read_write_util import ReadWriteUtil
from prompt_pipeline import PromptPipeline

In [2]:
reader = ReadWriteUtil()
API_KEY = os.getenv('API_KEY')
# print(API_KEY)
#Initialize reader and API_KEY

In [3]:
df = pd.read_csv('../data/news.csv')
df.head()
#Read data

Unnamed: 0,Domain,Title,Description,Body,Link,timestamp,Analyst_Average_Score,Analyst_Rank,Reference_Final_Score
0,rassegnastampa.news,Boris Johnson using a taxpayer-funded jet for ...,…often trigger a protest vote that can upset…t...,Boris Johnson using a taxpayer-funded jet for ...,https://rassegnastampa.news/boris-johnson-usin...,2021-09-09T18:17:46.258006,0.0,4,1.96
1,twitter.com,"Stumbled across an interesting case, a woman f...","Stumbled across an interesting case, a woman f...","Stumbled across an interesting case, a woman f...",http://twitter.com/CoruscaKhaya/status/1435585...,2021-09-08T13:02:45.802298,0.0,4,12.0
2,atpe-tchad.info,Marché Résines dans les peintures et revêtemen...,…COVID-19…COVID…COVID…COVID-19 et Post COVID…C...,Le rapport d’étude de marché Résines dans les ...,http://atpe-tchad.info/2021/09/13/marche-resin...,2021-09-13T07:32:46.244403,0.0,4,0.05
3,badbluetech.bitnamiapp.com,"AI drives data analytics surge, study finds",…hate raiders' linked to automated harassment ...,How to drive the funnel through content market...,http://badbluetech.bitnamiapp.com/p.php?sid=21...,2021-09-11T00:17:45.962605,0.0,4,6.1
4,kryptogazette.com,Triacetin Vertrieb Markt 2021: Globale Unterne...,…Abschnitten und Endanwendungen / Organisation...,Global Triacetin Vertrieb-Markt 2021 von Herst...,https://kryptogazette.com/2021/09/08/triacetin...,2021-09-08T12:47:46.078369,0.0,4,0.13


`Lets check for lengths of the body`

In [4]:
lengths = df['Body'].apply(lambda x: len(x))
lengths


0    13712
1      267
2     8273
3    22098
4     9554
5     1899
6     1729
7     3061
8     4428
9     5257
Name: Body, dtype: int64

`We can see that the lowest number of charachters we have is 267 most are above 2048 which is the highest number of tokens accepted by cohere, so we have to find a new approach to preprocess this data`

`We will try to understand the context of the body`

### Cleaning and Preprocessing

#### Convert to lower case

In [5]:
df['body_lower'] = [text.lower() for text in df['Body']]

In [6]:
df['body_lower']

0    boris johnson using a taxpayer-funded jet for ...
1    stumbled across an interesting case, a woman f...
2    le rapport d’étude de marché résines dans les ...
3    how to drive the funnel through content market...
4    global triacetin vertrieb-markt 2021 von herst...
5    south african police service office of the pro...
6    today is the 7th anniversary [tragic collapse ...
7    construction activity grew steadily by 4% in t...
8    - former eskom ceo matshela moses koko sought ...
9    global and regional beta-carotene market resea...
Name: body_lower, dtype: object

#### Tokenization

In [9]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/n/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [11]:
wtoken = [word_tokenize(text) for text in df['body_lower']]
# wtoken
stoken = [sent_tokenize(text) for text in df['body_lower']]

#### Remove Punctuation

In [24]:
reg = re.compile('(@[a-z0-9]+)|([^0-9a-z \t])|(\w+:\/\/\S+)')

no_punc = []

for filt in stoken:
    review = []
    for token in filt:
        new_token = reg.sub(u'', token)
        if not new_token == u'':
            review.append(new_token)
    no_punc.append(review)
    
# print(no_punc)

#### Remove Stop Words

In [16]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /home/n/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [25]:
no_stop = []

for text in no_punc:
    new_term_vector = []
    for word in text:
        if not word in stopwords.words('english'):
            new_term_vector.append(word)
            
    no_stop.append(new_term_vector)

#### Stemming and lemmatization

In [19]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/n/nltk_data...


True

In [21]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /home/n/nltk_data...


True

In [26]:
pstem = PorterStemmer()
wlem = WordNetLemmatizer()
preproc_text = []

for text in no_stop:
    final_text = []
    for word in text:
        pstem.stem(word)
        final_text.append(wlem.lemmatize(word))
    
    preproc_text.append(final_text)


#### Remove web words

In [23]:
reg = re.compile(r'(http)|(^www)|(html$)|(\w*\d{3,}\w*)')

tuned_text = []

for filt in preproc_text:
    review = []
    for token in filt:
        new_token = reg.sub(u'', token)
        if not new_token == u'':
            review.append(new_token)
    tuned_text.append(review)

In [30]:
df['context_txt'] = [" ".join(word) for word in tuned_text]
le = df['context_txt'].apply(lambda x: len(x))
le1 = df['Body'].apply(lambda x: len(x))
le,le1

(0     9350
 1      187
 2     5480
 3    17921
 4     8406
 5     1247
 6     1189
 7     2098
 8     2916
 9     3775
 Name: context_txt, dtype: int64,
 0    13712
 1      267
 2     8273
 3    22098
 4     9554
 5     1899
 6     1729
 7     3061
 8     4428
 9     5257
 Name: Body, dtype: int64)

`We can see that we still have more characters than the accepted one, so we have to discard from our prompt the one that have more characters`

In [31]:
title_length = df['Title'].apply(lambda x: len(x))
description_length = df['Description'].apply(lambda x: len(x))
title_length

0    121
1    267
2    222
3     43
4    156
5    103
6     54
7     78
8     84
9    150
Name: Title, dtype: int64

In [32]:
description_length

0    251
1    267
2    245
3    248
4    251
5    251
6    253
7    253
8    252
9    249
Name: Description, dtype: int64

### Preprocess description 

#### Convert to lowercase

In [33]:
df['desc_lower'] = [text.lower() for text in df['Description']]
df['desc_lower']

0    …often trigger a protest vote that can upset…t...
1    stumbled across an interesting case, a woman f...
2    …covid-19…covid…covid…covid-19 et post covid…c...
3    …hate raiders' linked to automated harassment ...
4    …abschnitten und endanwendungen / organisation...
5    …crime stamp out…n1 and r101 roads appear in c...
6    …in lagos, nigeria, 84 south africans were kil...
7    …additional spending on buildings, repairs and...
8    …lawsuit against public participation) designe...
9    …key players! – dsm – basf – allied biotech – ...
Name: desc_lower, dtype: object

#### Tokenization

In [34]:
desc_sent_token = [sent_tokenize(text) for text in df['Description']]

#### Removing punctuation

In [35]:
reg = re.compile('(@[a-z0-9]+)|([^0-9a-z \t])|(\w+:\/\/\S+)')

no_punc = []

for filt in desc_sent_token:
    review = []
    for token in filt:
        new_token = reg.sub(u'', token)
        if not new_token == u'':
            review.append(new_token)
    no_punc.append(review)


#### Removing stop words

In [36]:
no_stop = []

for text in no_punc:
    new_term_vector = []
    for word in text:
        if not word in stopwords.words('english'):
            new_term_vector.append(word)
            
    no_stop.append(new_term_vector)

#### Stemming and lemmatization

In [37]:
preproc_text = []

for text in no_stop:
    final_text = []
    for word in text:
        pstem.stem(word)
        final_text.append(wlem.lemmatize(word))
    
    preproc_text.append(final_text)

In [38]:
reg = re.compile(r'(http)|(^www)|(html$)|(\w*\d{3,}\w*)')

tuned_text = []

for filt in preproc_text:
    review = []
    for token in filt:
        new_token = reg.sub(u'', token)
        if not new_token == u'':
            review.append(new_token)
    tuned_text.append(review)

In [39]:

df['desc_context_txt'] = [" ".join(word) for word in tuned_text]

In [40]:
df['desc_context_txt']

0    often trigger a protest vote that can upsettha...
1    tumbled across an interesting case a woman fac...
2     et ost ovid19  valuation des risques lis au 1...
3    hate raiders linked to automated harassment ca...
4    bschnitten und ndanwendungen  rganisationen ov...
5    rime tamp ut1 and  roads appear in court   rim...
6    in agos igeria 84 outh fricans were killed he ...
7    additional spending on buildings repairs and s...
8    awsuit gainst ublic articipation designed to i...
9    key players      llied iotech  hrhistorical ma...
Name: desc_context_txt, dtype: object

### Classify to Groups/Classes Based on Analyst_Average_Score

In [41]:
df

Unnamed: 0,Domain,Title,Description,Body,Link,timestamp,Analyst_Average_Score,Analyst_Rank,Reference_Final_Score,body_lower,context_txt,desc_lower,desc_context_txt
0,rassegnastampa.news,Boris Johnson using a taxpayer-funded jet for ...,…often trigger a protest vote that can upset…t...,Boris Johnson using a taxpayer-funded jet for ...,https://rassegnastampa.news/boris-johnson-usin...,2021-09-09T18:17:46.258006,0.0,4,1.96,boris johnson using a taxpayer-funded jet for ...,boris johnson using taxpayerfunded jet electio...,…often trigger a protest vote that can upset…t...,often trigger a protest vote that can upsettha...
1,twitter.com,"Stumbled across an interesting case, a woman f...","Stumbled across an interesting case, a woman f...","Stumbled across an interesting case, a woman f...",http://twitter.com/CoruscaKhaya/status/1435585...,2021-09-08T13:02:45.802298,0.0,4,12.0,"stumbled across an interesting case, a woman f...",stumbled across interesting case woman facing ...,"stumbled across an interesting case, a woman f...",tumbled across an interesting case a woman fac...
2,atpe-tchad.info,Marché Résines dans les peintures et revêtemen...,…COVID-19…COVID…COVID…COVID-19 et Post COVID…C...,Le rapport d’étude de marché Résines dans les ...,http://atpe-tchad.info/2021/09/13/marche-resin...,2021-09-13T07:32:46.244403,0.0,4,0.05,le rapport d’étude de marché résines dans les ...,le rapport tude de march rsines dans le peintu...,…covid-19…covid…covid…covid-19 et post covid…c...,et ost ovid19 valuation des risques lis au 1...
3,badbluetech.bitnamiapp.com,"AI drives data analytics surge, study finds",…hate raiders' linked to automated harassment ...,How to drive the funnel through content market...,http://badbluetech.bitnamiapp.com/p.php?sid=21...,2021-09-11T00:17:45.962605,0.0,4,6.1,how to drive the funnel through content market...,drive funnel content marketing link building s...,…hate raiders' linked to automated harassment ...,hate raiders linked to automated harassment ca...
4,kryptogazette.com,Triacetin Vertrieb Markt 2021: Globale Unterne...,…Abschnitten und Endanwendungen / Organisation...,Global Triacetin Vertrieb-Markt 2021 von Herst...,https://kryptogazette.com/2021/09/08/triacetin...,2021-09-08T12:47:46.078369,0.0,4,0.13,global triacetin vertrieb-markt 2021 von herst...,global triacetin vertriebmarkt von herstellern...,…abschnitten und endanwendungen / organisation...,bschnitten und ndanwendungen rganisationen ov...
5,mype.co.za,Male arrested for the murder of an elderly fem...,…Crime Stamp Out…N1 and R101 roads appear in c...,South African Police Service Office of the Pro...,https://mype.co.za/new/male-arrested-for-the-m...,2021-09-10T00:17:46.055622,1.33,2,11.0,south african police service office of the pro...,south african police service office provincial...,…crime stamp out…n1 and r101 roads appear in c...,rime tamp ut1 and roads appear in court rim...
6,eminetra.co.za,7th Anniversary of SCOAN Collapse in Nigeria-S...,"…in Lagos, Nigeria, 84 South Africans were kil...",Today is the 7th anniversary [Tragic collapse ...,https://eminetra.co.za/7th-anniversary-of-scoa...,2021-09-12T05:17:50.279081,0.0,4,10.1,today is the 7th anniversary [tragic collapse ...,today 7th anniversary tragic collapse building...,"…in lagos, nigeria, 84 south africans were kil...",in agos igeria 84 outh fricans were killed he ...
7,eminetra.co.za,The construction sector is expected to be boos...,"…additional spending on buildings, repairs and...",Construction activity grew steadily by 4% in t...,https://eminetra.co.za/the-construction-sector...,2021-09-09T09:02:46.320793,1.66,1,1.36,construction activity grew steadily by 4% in t...,construction activity grew steadily 4 second q...,"…additional spending on buildings, repairs and...",additional spending on buildings repairs and s...
8,news24.com,News24.com | Court dismisses attempt by former...,…Lawsuit Against Public Participation) designe...,- Former Eskom CEO Matshela Moses Koko sought ...,https://www.news24.com/news24/southafrica/news...,2021-09-09T19:32:46.239682,0.33,3,2.4,- former eskom ceo matshela moses koko sought ...,former eskom ceo matshela moses koko sought da...,…lawsuit against public participation) designe...,awsuit gainst ublic articipation designed to i...
9,manometcurrent.com,Global and Regional Beta-Carotene Market Resea...,…key players! – DSM – BASF – Allied Biotech – ...,Global and Regional Beta-Carotene Market Resea...,https://manometcurrent.com/global-and-regional...,2021-09-13T03:02:45.609228,0.0,4,0.22,global and regional beta-carotene market resea...,global regional betacarotene market research r...,…key players! – dsm – basf – allied biotech – ...,key players llied iotech hrhistorical ma...


In [42]:
rank__to_10 = df['Analyst_Average_Score'].apply(lambda x: 'low' if x < 5 else 'high')
rank__to_10
#We will add new column with value 'high' for Analyst_Average_Score >= 5 and 'low' for Analyst_Average_Score < 5 and >=0

0    low
1    low
2    low
3    low
4    low
5    low
6    low
7    low
8    low
9    low
Name: Analyst_Average_Score, dtype: object

In [43]:
df['rank_to_10'] = rank__to_10
df['rank_to_10']

0    low
1    low
2    low
3    low
4    low
5    low
6    low
7    low
8    low
9    low
Name: rank_to_10, dtype: object

In [None]:
def handle_sub_class(value):
    if value >= 0 and value < 1:
        return "low_1"
    elif value >= 1 and value < 2:
        return "low_2"
    elif value >= 2 and value < 3:
        return "low_3"
    elif value >= 3 and value < 4:
        return "low_4"
    elif value >= 4 and value < 5:
        return "low_5"
    elif value >= 5 and value < 6:
        return "high_1"
    elif value >= 6 and value < 7:
        return "high_2"
    elif value >= 7 and value < 8:
        return "high_3"
    elif value >= 8 and value < 9:
        return "high_4"
    else:
        return "high_5"

#We will add new column with value 'high_1' for rows having Analyst_Average_Score value of 5-6, high_2 for rows having Analyst_Average_Score value of 6-7 til high_5 for rows having Analyst_Average_Score value of 9-10 
#We will add new column with value 'low_1' for rows having Analyst_Average_Score value of 0-1, low_2 for rows having Analyst_Average_Score value of 1-2 til low_5 for rows having Analyst_Average_Score value of 4-5 