In [310]:
# importing libraries and packages
import urllib.request
from bs4 import BeautifulSoup
from langdetect import detect
import GetOldTweets3 as got
import pandas as pd
import ssl
import requests
ssl._create_default_https_context = ssl._create_unverified_context

# 1) Scraping Twitter

### @Rwanda_Edu (Ministry of Education) & @REBRwanda (Rwanda Education Board)

In [290]:
"""
Function to get Tweets data from Twitter
"""
def get_twitter_data(handle, start_date, end_date):
    # filters
    filters = got.manager.TweetCriteria().setUsername(handle)\
                          .setSince(start_date)\
                          .setUntil(end_date)\
    
    # scraping
    tweet = got.manager.TweetManager.getTweets(filters)
    
    # creating list
    data = [[tw.username,
                tw.text,
                tw.date,
                tw.retweets,
                tw.favorites,
                tw.mentions,
                tw.hashtags] for tw in tweet]
    
    # transforming list into dataframe
    tweets_df = pd.DataFrame(data, columns = ['username', 
                                              'tweet', 
                                              'date', 
                                              'favorites', 
                                              'retweets', 
                                              'mentions', 
                                              'hashTags'])
    
    return tweets_df

In [292]:
# Twitter accounts to scrape
twitter_sources = ['rwanda_edu', 'REBRwanda']
# getting tweets from the sources from April 1st 2020 until August 13th 2020
tweets_df = get_twitter_data(twitter_sources, 
                     start_date = "2020-04-01", 
                     end_date = "2020-08-13").sort_values('date', ascending=False)

In [293]:
tweets_df

Unnamed: 0,username,tweet,date,favorites,retweets,mentions,hashTags
0,Rwanda_Edu,The Minister of Education .@Dr_Uwamariya appre...,2020-08-12 12:50:42+00:00,9,76,@Dr_Uwamariya @Uni_Rwanda,
1,Rwanda_Edu,Discussions centered on enhancement of impacti...,2020-08-12 10:49:29+00:00,9,61,@Uni_Rwanda,
2,Rwanda_Edu,The Minister of Education @Dr_Uwamariya held a...,2020-08-12 10:49:21+00:00,30,189,@Dr_Uwamariya @Uni_Rwanda,#Rwanda
3,REBRwanda,La formation des professeurs de français dans ...,2020-08-10 14:48:48+00:00,4,23,@ambafrancerwa @Rwanda_Edu @RwandaPolytec,
4,REBRwanda,En discussion avec les professeurs de français...,2020-08-10 14:41:41+00:00,6,22,@Rwanda_Edu,
5,REBRwanda,"Lors du lancement de la formation, le directeu...",2020-08-10 14:26:33+00:00,2,17,,
6,REBRwanda,"Aujourd'hui, le directeur général du Rwanda Ed...",2020-08-10 14:15:32+00:00,14,63,@Rwanda_Edu @RwandaPolytec,
7,Rwanda_Edu,"Munyarwanda, umurimo wose waba ukora, umuganda...",2020-08-10 13:52:41+00:00,24,108,,#TwiyubakireAmashuri
8,Rwanda_Edu,The ongoing countrywide school construction st...,2020-08-10 11:01:21+00:00,25,91,,
9,REBRwanda,"Mwiriwe neza Bwana Tom Ndahiro, iki gitabo uvu...",2020-08-09 11:58:14+00:00,7,41,,


In [296]:
# verifying that every tweet has text value
print((tweets_df['tweet']=="").value_counts())
# removing tweet without text as we cannot determine language
tweets_df = tweets_df[tweets_df['tweet']!=""]

False    413
True       1
Name: tweet, dtype: int64


In [297]:
"""
Function to detect the language of each Tweet and to return list of languages
""" 
def get_tweet_language(df, col):
    return df.apply(
        lambda x: detect(x[col]),
        axis=1
    )

In [298]:
# getting the list of detected languages for the dataframe
language_list = get_tweet_language(tweets_df, 'tweet')

In [299]:
# checking the distribution of languages detected
print(language_list.value_counts())

en    200
sw    199
id      4
fr      4
tl      3
hr      1
tr      1
sl      1
dtype: int64


In [300]:
# as Kinyarwanda is not in the list of languages to be detected by the langdetect package, 
# we consider other languages than English and French to be Kinyarwanda
kinyarwanda = ["sw", "id", "so", "tl", "sl", "tr"]
new_language_list = ['kr' if n in kinyarwanda else n for n in language_list]

In [301]:
# adding the list of languages detected to the dataframe
tweets_df['language'] = new_language_list

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  


In [328]:
# finding which tweets have relevant labels
covid_related = ['covid']
tweets_df['tweet_search'] = tweets_df['tweet'].str.lower()
tweets_df['tweet_type'] = tweets_df.tweet_search.apply(lambda sentence: all(word in sentence for word in covid_related))
print(tweets_df['tweet_type'].value_counts())

False    380
True      33
Name: tweet_type, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


In [333]:
tweets_df[tweets_df['tweet_type']]['tweet']

21     Mukiganiro na @rbarwanda , Umugenzuzi Mukuru m...
23     Mukiganiro na @rbarwanda , Umuyobozi Mukuru wa...
24     Aganira na @rbarwanda ,Umuyobozi Mukuru wa REB...
44     .@Dr_Uwamariya aganira n’abaturage bari mu mug...
53     The chairman of @PSF_Rwanda Mr Robert Bapfakur...
67     She also visited @IPRCMusanze and Muhabura Int...
75     .@Dr_Uwamariya, the Minister of Education repr...
76     Four major areas had been identified with spec...
77     The 1st EU-AU Research & Innovation Ministeria...
106    The inspection also aims to assess measures un...
113    Umunyamabanga wa Leta ushinzwe amashuri abanza...
154    .@Dr_Uwamariya :The Financial Year 2020/21 is ...
156    .@Dr_Uwamariya : Following the outbreak &amp; ...
161    Happening today 25 June 2020: There is a Forwa...
187    Simon, ntacyo bitwaye gusa musabwa kubahiriza ...
188    @Dr_Uwamariya : " Ntacyo byaba bimaze twubatse...
203    Dear students and parents, stay tuned to a liv...
204    Director General of Rwan

In [304]:
# exporting the tweets to a CSV file
tweets_df.to_csv('tweets_df.csv')

# 2) Scraping the Ministry Of Education website

In [9]:
# URL that contains the information to scrape
URL = "https://mineduc.gov.rw/index.php?id=166"

In [324]:
# getting the cookie access key
page = urllib.request.urlopen('https://mineduc.gov.rw/index.php?id=166')
page.read()

b"<html><script lang=javascript>\r\ndocument.cookie = '_accessKey2=Yp0iypgmHFIg2/Iwe6ydkYlrXqCnt0Z6'\r\nwindow.location.reload();\r\n</script></html>\r\n"

In [325]:
# scraping the page using the cookie access key from previous request
Cookies={'Cookie':r"_accessKey2=Yp0iypgmHFIg2/Iwe6ydkYlrXqCnt0Z6"}
Request=urllib.request.Request(url=URL,headers=Cookies)
cookieProcessor = urllib.request.HTTPCookieProcessor()
opener = urllib.request.build_opener(cookieProcessor)
response = opener.open(Request,timeout=100)
html_text = response.read()

In [327]:
# parsing the html content
soup = BeautifulSoup(html_text, 'html.parser')
headers = [p.get_text() for p in soup.find_all("b")]
text_page=soup.find('div').getText()
print(text_page)







info@mineduc.gov.rw

Hotline 2028 
 Staff Mail


 English  Kinyarwanda 




















































    $(document).ready(function (){
     var param1 = $(".previous-keyword").val();
      $('.new-search-keyword').val(param1);
    });     









MENU






HOMEABOUT USMinistry OfficialsVisionMission of the MinistryDirectoratesDirector General of Education PlanningStaffStaff contactsMEDIANewsPress ReleaseSpeechesRESOURCELawsEducation LawsEducation OrdersMinisterial instructionsPoliciesStatisticsStatistical Year BooksData Collection ToolsReportsESSPESSP 2018-2024ESSP2013-2018ESSP2008-2012ESSP2004-2008SCHOOL CALENDARSMAP OF SCHOOLSPrimary SchoolsSecondary Ordinal LevelSecondary Advanced  LevelSecondary SchoolsBoarding Secondary SchoolsTeacher Training CollegesTechnical Secondary SchoolsVocational Training CentersPolytechnicsHigher EducationProcurement PlanSERVICESRequirements for Setting up a Private Higher Learning InstitutionLooking for Rwanda Student Lo