In [1]:
import pandas as pd



# Twitter Transparency Reports 

I downloaded India-specific reports from [Twitter's transparency center](https://transparency.twitter.com/en/reports/countries/in.html), cleaned and compiled them in Excel into a database. These reports began in 2012, as a way for Twitter to be transparent about "government requests that impact the public, whether through overt attempts at political censorship or by way of soliciting account data through information requests.” They include their rate of compliance to these requests and the number of users explicitly identified. 

In [4]:
removal_requests = pd.read_csv('TwitterTransparencyReportIndia_RemovalRequests.csv')
info_reports = pd.read_csv('TwitterTransparencyReportIndia_InformationReports.csv')


In [5]:
removal_requests.head()

Unnamed: 0,Time period,Legal demands: court orders,Other legal demands,Combined removal requests,% change,Court orders compliance rate,Other legal demands compliance rate,Combined compliance rate,Court orders accounts specified,Other legal demands accounts specified,Combined accounts specified,Combined accounts withheld,Combined tweets withheld,Court orders accounts TOS,Other legal demands accounts TOS,Combined accounts TOS
0,July - December 2020,15.0,6956,6971,151.0,73,9,9.0,220,18908,19128,60.0,598,2,1308,1310
1,January - June 2020,4.0,2768,2772,254.0,25,13.8,13.9,9,13191,13200,17.0,377,0,1159,1159
2,July - December 2019,7.0,775,782,55.0,85.7,36.3,36.7,1230,6604,7834,16.0,1481,4,988,992
3,January - June 2019,8.0,496,504,-24.0,-,-,,-,-,2484,73.0,241,-,-,578
4,July - December 2018,10.0,657,667,171.0,-,-,,-,-,2228,95.0,114,-,-,320


In [6]:
info_reports.head()

Unnamed: 0,Report,Information requests - Routine,Information requests - Emergency,Information requests - Combined,Compliance rate - Routine,Compliance rate - Emergency,Compliance rate - Combined,Accounts specified - Routine,Accounts specified - Emergency,Accounts specified - Combined,Preservation - Accounts,Preservation - Requests
0,July - December 2020,3463,152,3615,0.6,0.6,0.6,7508,254,7762,3877,1585
1,January - June 2020,2367,246,2613,1.0,0.8,1.0,5906,440,6346,2366,526
2,July - December 2019,662,127,789,1.8,0.7,1.6,2683,190,2873,1028,144
3,January - June 2019,395,79,474,5.3,3.8,5.0,1162,106,1268,145,36
4,July - December 2018,373,49,422,19.0,10.2,18.0,976,76,1052,100,30


# Scraping from the Lumen Database 

The [Lumen Database](https://lumendatabase.org/) is an initiative by Harvard University's Berkman Klein Center. It collects and analyzes legal complaints and requests for removal of online materials across the world. I ran an advanced search to obtain results for all content removal requests made from the Indian government or its subsidiaries to Twitter. 

I first made a list of paginated search links, used Selenium to scrape basic information about each legal notice from the results page. Then I used selenium to click through each link. Most of the results contained PDF or Word files that had to be downloaded, so I pulled the URL for each of these. I then obtained research credentials in order to download them separately. 

The code for each step is below.



In [None]:
links = ["https://lumendatabase.org/notices/search?utf8=%E2%9C%93&title=india+twitter&title-require-all=true&sort_by=",
        "https://lumendatabase.org/notices/search?page=2&sort_by=&title-require-all=true&title=india+twitter",
         "https://lumendatabase.org/notices/search?page=3&sort_by=&title-require-all=true&title=india+twitter",
         'https://lumendatabase.org/notices/search?page=4&sort_by=&title-require-all=true&title=india+twitter',
         "https://lumendatabase.org/notices/search?page=5&sort_by=&title-require-all=true&title=india+twitter",
        "https://lumendatabase.org/notices/search?page=6&sort_by=&title-require-all=true&title=india+twitter",
         "https://lumendatabase.org/notices/search?page=7&sort_by=&title-require-all=true&title=india+twitter",
         "https://lumendatabase.org/notices/search?page=8&sort_by=&title-require-all=true&title=india+twitter",
         "https://lumendatabase.org/notices/search?page=9&sort_by=&title-require-all=true&title=india+twitter",
         "https://lumendatabase.org/notices/search?page=10&sort_by=&title-require-all=true&title=india+twitter",
         "https://lumendatabase.org/notices/search?page=11&sort_by=&title-require-all=true&title=india+twitter",
         "https://lumendatabase.org/notices/search?page=12&sort_by=&title-require-all=true&title=india+twitter",
         "https://lumendatabase.org/notices/search?page=13&sort_by=&title-require-all=true&title=india+twitter",
        "https://lumendatabase.org/notices/search?page=14&sort_by=&title-require-all=true&title=india+twitter"]


In [None]:
requests = []

for link in links:
    driver.get(link)

    results = driver.find_elements_by_tag_name('li')


    for each in results:
        titles = each.find_elements_by_class_name('title')

        for eachtitle in titles:
            url_long = eachtitle.find_elements_by_tag_name('a')
            for eachu in url_long:
                url = eachu.get_attribute('href')
                title = eachu.text

        metadata = each.find_elements_by_class_name('metadata')

        for eachm in metadata:

            date_rec = eachm.find_elements_by_class_name('date-received')
            for eachd in date_rec: 
                date_long = (eachd.find_elements_by_tag_name('time'))
                for each in date_long:
                    date_received = (each.get_attribute('datetime'))

            date_sub = eachm.find_elements_by_class_name('date-submitted')
            for eachd in date_sub: 
                date_long2 = (eachd.find_elements_by_tag_name('time'))
                for each in date_long2:
                    date_submitted = (each.get_attribute('datetime'))

            sender_receiver = eachm.find_elements_by_class_name('sender-receiver')
            for eachs in sender_receiver: 
                sender_long = (eachs.find_elements_by_class_name('sender'))
                for each in sender_long:
                    sender = (each.text)
                receiver_long = (eachs.find_elements_by_class_name('receiver'))
                for each in receiver_long:
                    receiver = (each.text)

            excerpt_full = eachm.find_elements_by_class_name('excerpt')
            for eache in excerpt_full:
                excerpt = (eache.text)



            request = {
                    'url': url,
                    'title_text': title,
                    'date_received' : date_received,
                    'sender': sender,
                    'receiver': receiver,
                    'date_submitted': date_submitted,
                    'excerpt': excerpt
            }

            requests.append(request)

noticedf = pd.DataFrame(requests)

#### Clicking through each notice URL to get the PDF URL

In [None]:
urls = noticedf.url.tolist()

In [None]:
pdflist = []

for link in urls:
    driver.get(link)

    # driver.get("https://lumendatabase.org/notices/22220327")

    results = driver.find_elements_by_class_name('attachments')
    # print (results)

    for each in results:
        docs = each.find_elements_by_class_name('document')
        for eachdoc in docs:
            hrefs = eachdoc.find_elements_by_tag_name('a')
            for eachhref in hrefs:
                if 'doc' in eachhref.get_attribute('href'):
                    url = eachhref.get_attribute('href')
                    name = link
                    
                    
        pdfs = each.find_elements_by_class_name('pdf')
        for eachpdf in pdfs:
            hrefs = eachpdf.find_elements_by_tag_name('a')
            for eachhref in hrefs:
                 if 'pdf' in eachhref.get_attribute('href'):
                    url = eachhref.get_attribute('href')
                    name = link

                    pdfordoc = {
                        'pdf_url': url,
                        'notice_url': name

                        }

                    pdflist.append(pdfordoc)

pdfs = pd.DataFrame(pdflist)

#### Joining the dataframe with PDF links to the first dataframe, and some basic cleaning

In [None]:
lumendf = df.merge(pdf_df, how='left', left_on='url', right_on='notice_url')


In [None]:
lumendf.date_received = pd.to_datetime(lumendf.date_received.str.extract(r'([\d\-]+)T'))
lumendf.date_submitted = pd.to_datetime(lumendf.date_submitted.str.extract(r'([\d\-]+)T'))

lumendf.head()

In [None]:
# lumendf.to_csv('all_lumen_notices_final.csv')

#### Downloading the PDFs locally

Please note: The research credentials in this code are no longer valid; to replicate, make a request with the Lumen database to obtain a researcher account. 

In the following step, I extract all the text from these PDFs and upload the completed file so it can be viewed.

In [None]:
all_pdfs = lumendf.pdf_url.tolist()

In [None]:
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_experimental_option('prefs',  {
    "plugins.always_open_pdf_externally": True
})

driver = webdriver.Chrome(ChromeDriverManager().install(), options=options)
    


In [None]:
driver.get("https://lumendatabase.org/")

signin = driver.find_element_by_xpath("/html/body/footer/div/div[2]/nav/span[5]/a").click()

driver.find_element_by_xpath("/html/body/section/div/div[2]/form/div[1]/input").send_keys("paroma.soni@columbia.edu")

driver.find_element_by_xpath("/html/body/section/div/div[2]/form/div[2]/input").send_keys("Lumen_Researcher_2021_PS")

driver.find_element_by_xpath("/html/body/section/div/div[2]/form/div[3]/input").click()


In [None]:
for url in all_pdfs:
#   driver = webdriver.Chrome(options=options)
    try:
        driver.get(url)
    except:
        pass

In [9]:
lumendf = pd.read_csv('all_lumen_notices_final.csv')

In [10]:
lumendf['pdf_filename'] = lumendf.pdf_url.str.extract(r'original\/([\w\W\d]*.[pdfdocx])')


Not all downloaded PDFs matched the filename in their respective URLs. So I changed the directory to the folder with the PDFs, and used `ls` on the command line to obtain a list of the filenames, which I saved into an Excel file.

In [None]:
lumen_filelist = pd.read_excel('lumen_filelist.xlsx')

In [None]:
#lumen_df.merge(lumen_filelist, how='left', left_on='pdf_filename', right_on='lumen_filelist').to_csv('lumen notices with pdf name.csv')


I then merged the two and saved it as a .csv file, manually going through it to correct the ones that were different. I then ran the PDFs which did _not_ contain searchable text into Adobe Acrobat's batch OCR tool, and saved those with an "\_OCR" suffix.

In [None]:
lumen_filelist_OCR = lumen_filelist.replace(to_replace ='.pdf', value = '_OCR.pdf', regex = True)
lumen_filelist_OCR = lumen_filelist_OCR.lumen_filelist.to_list()


In [None]:
import PyPDF2

textfromfiles = []
pagenum = -1 

for eachfile in lumen_filelist_OCR:
    try:
        pdfFileObj = open(f'LumenOCR/{eachfile}', 'rb')
        pdfReader = PyPDF2.PdfFileReader(pdfFileObj)
        count = (pdfReader.numPages)

        numlist = list(range(count))
        textlist = []  

        for num in numlist:
            pageObj = pdfReader.getPage(num)

            try:
                text=(pageObj.extractText())
                text=text.split(",")
        #         text
            except:
                text=f'{eachfile} on page number {num+1} had an error.'

            if num+1 == count:
                textlist.append(text)
            else:
                text2 = text

                textlist.append(text2)
    except:
        eachfile = 'not found'
            
    eachpdftext = {
            'pdf_name': eachfile,
#             'page_number': num+1,
            'max_pages': count,
            'text': textlist
            
        }
        
    textfromfiles.append(eachpdftext)
        
textfromfiles

df2 = pd.DataFrame(textfromfiles)


Despite the OCR successfully converting the scanned images into searchable text, PyPDF still returned several errors when trying to extract that text. Many of these files – both OCR and plain text – also contained multiple languages, including Hindi and Telegu, which text extraction libraries had difficulty identifying. 

I proceeded to manually copy paste and clean the text from these PDFs, adding them to the Excel file below.

In [14]:
textdf = pd.read_excel('lumen_fulltext.xlsx')
textdf.relevant = textdf.relevant.fillna('y')
textdf = textdf[textdf.relevant == 'y']
textdf.head()

Unnamed: 0,pdf_filename,lumen_filelist,date_received,text_from_pdf,OCR_yn,relevant,num_tweets
0,_India__Lumen_Notice_for_Notice___Takedown_Req...,_India__Lumen_Notice_for_Notice___Takedown_Req...,2018-05-15,Twitter Receipt of Complaint _________________...,,y,2
6,0113609641.pdf,0113609641.pdf,2019-05-09,"File No. 491/Social Media/2019 Dated: 09 May, ...",,y,9
7,0113614419.pdf,0113614419.pdf,2019-05-09,"File No. 491/Social Media/2019 Dated: 09 May, ...",,y,1
8,05_16_19_ECI_Order.pdf,05_16_19_ECI_Order.pdf,2019-05-16,"File No. 491/Social Media/2019 Dated: 16 May, ...",,y,36
9,05_May__2019_Twitter_TDR_on_as.pdf,05_May__2019_Twitter_TDR_on_as.pdf,2019-05-05,ElectionCommissionofIndia\nFile No. 491/Social...,y,y,0


Merging the extracted text with the original Lumen database.

In [15]:
df = lumendf.merge(textdf, how='right',left_on='pdf_filename', right_on='pdf_filename').drop_duplicates()



In [27]:
df = df[['url', 'title_text', 'date_received_x', 'sender','receiver', 'date_submitted', 'pdf_url', 'lumen_filelist', 'text_from_pdf']]



Extracting the Twitter URLs and adding them to the dataframe, as well as the number of tweets flagged in each legal notice:

In [23]:
import re

In [24]:
df['tweetlist'] = df.text_from_pdf.apply(lambda txt: re.findall("https?://twitter.com[^\s]*", txt))


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['tweetlist'] = df.text_from_pdf.apply(lambda txt: re.findall("https?://twitter.com[^\s]*", txt))


In [25]:
tweetcount = []
for each in (df.tweetlist):
    tweetcount.append(len(each))

df['num_of_tweets'] = tweetcount   


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['num_of_tweets'] = tweetcount


In [26]:
df.head()

Unnamed: 0,url,title_text,date_received_x,sender,receiver,date_submitted,pdf_url,lumen_filelist,text_from_pdf,tweetlist,num_of_tweets
0,https://lumendatabase.org/notices/16562206,Legal Request to Twitter from India - Ministry...,2018-05-15,Ministry of Electro…,Twitter,2018-05-24,https://lumendatabase.org/file_uploads/files/4...,_India__Lumen_Notice_for_Notice___Takedown_Req...,Twitter Receipt of Complaint _________________...,"[https://twitter.com/TheVoiceKashmir, https://...",2
1,https://lumendatabase.org/notices/16654365,Legal Request to Twitter from India - Ministry...,2018-05-31,Ministry of Electro…,Twitter,2018-06-07,https://lumendatabase.org/file_uploads/files/4...,_India__Lumen_Notice_for_Notice___Takedown_Req...,Twitter Receipt of Complaint _________________...,"[https://twitter.com/TheVoiceKashmir, https://...",2
2,https://lumendatabase.org/notices/18523843,Legal Request to Twitter from India - Election...,2019-05-09,Election Commission…,Twitter,2019-05-09,https://lumendatabase.org/file_uploads/files/4...,0113609641.pdf,"File No. 491/Social Media/2019 Dated: 09 May, ...",[https://twitter.com/TarunAg79908414/status/11...,9
3,https://lumendatabase.org/notices/18523843,Legal Request to Twitter from India - Election...,2019-05-09,Election Commission…,Twitter,2019-05-09,https://lumendatabase.org/file_uploads/files/4...,0113609641.pdf,"File No. 491/Social Media/2019 Dated: 09 May, ...",[https://twitter.com/TarunAg79908414/status/11...,9
4,https://lumendatabase.org/notices/18524111,Legal Request to Twitter from India - Election...,2019-05-09,Election Commission…,Twitter,2019-05-09,https://lumendatabase.org/file_uploads/files/4...,0113614419.pdf,"File No. 491/Social Media/2019 Dated: 09 May, ...",[https://twitter.com/IAm_Sanjaysri/status/1125...,1


Saving it as a list of tweets as well, to use the Twitter API on.

In [None]:
df['split_text'] = df.text_from_pdf.str.split(r'[ \n]')

In [None]:
listoftweets = []
for each in df.split_text:
    for e in each:
        if 'http' in e:
            listoftweets.append(e)

listoftweets

# Using Tweepy (Twitter API) to get Tweet and User Information
I created an Excel sheet with the list of tweets above, as well as all the column names that I wanted to populate using Twitter API. I cleaned it into a dataframe I could use in Pandas.

In [30]:
tweet_text = pd.read_excel('tweet_text.xlsx')
tweet_text['username'] = tweet_text.tweetlist.str.extract(r'twitter.com/([\w\d_]+)')
alltweetslist = tweet_text[['tweetlist', 'username']]
alltweetslist['tweet_id'] = alltweetslist.tweetlist.str.extract(r'status/([\d\w\W]+)$')

alltweetslist.head(3)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  alltweetslist['tweet_id'] = alltweetslist.tweetlist.str.extract(r'status/([\d\w\W]+)$')


Unnamed: 0,tweetlist,username,tweet_id
0,https://twitter.com/TheVoiceKashmir,TheVoiceKashmir,
1,https://twitter.com/NayeemDass,NayeemDass,
2,https://twitter.com/TarunAg79908414/status/112...,TarunAg79908414,1.1259102158559683e+18


I also used a pivot table in Excel to conver the original Lumen dataframe to include each Twitter URL in a separate row, so I could merge my list of tweets with their corresponding legal notice.

In [31]:
lumen_links = pd.read_excel('lumen_with_twitter_links_perrow.xlsx')

In [33]:
all_tweets = alltweetslist.merge(lumen_links, how='left', left_on='tweetlist', right_on='tweetlist')
all_tweets.head(2)

Unnamed: 0,tweetlist,username,tweet_id,url,date_received_x,lumen_filelist,num_of_tweets
0,https://twitter.com/TheVoiceKashmir,TheVoiceKashmir,,https://lumendatabase.org/notices/16562206,2018-05-15,_India__Lumen_Notice_for_Notice___Takedown_Req...,2.0
1,https://twitter.com/TheVoiceKashmir,TheVoiceKashmir,,https://lumendatabase.org/notices/17135197,2018-08-15,Copy_of_Lumen_India.pdf,6.0


#### Using Tweepy to first find information about each user, and then each tweet

In [None]:
import tweepy

auth = tweepy.OAuthHandler #(access key, access key secret)

#access token, access token secret
auth.set_access_token #(access key, access key secret)

api = tweepy.API(auth,wait_on_rate_limit=False)


In [None]:
userdf_list = []

for each in tweet_text.username:
#     print (each)

    try:
        user = api.get_user(each)
        
        user_name = user.name
        screen_name = user.screen_name
        user_desc = user.description
        status_count = user.statuses_count
        following_count = user.friends_count
        followers_count = user.followers_count
        
        eachuser = {
        'screen_name': user_name,
        'twitter_handle': screen_name,
        'user_desc': user_desc,
       ' status_count': status_count,
        'following_count': following_count,
        'followers_count': followers_count,
        'active_status': 'active'
    }
        
    except tweepy.TweepError as e:
        error = e
         
        eachuser = {
        'twitter_handle': each,
        'active_status': error,
        }
    
    
    userdf_list.append(eachuser)
    
df_users = pd.DataFrame(userdf_list)

In [None]:
full_tweet_text = all_tweets.merge(df_users_final, how='left', left_on='username', right_on='twitter_handle').drop_duplicates()

full_tweet_text

In [None]:
statusdflist = []

for each in full_tweet_text.tweet_id2[:600]:
#     print (each)

    try:
        status = api.get_status(each, tweet_mode="extended")
        status_text = (status.full_text)
        
    except tweepy.TweepError as e:
        status_text = e
    
    eachstatus = {
        'tweet_id': each,
        'tweet_text': status_text
    }
    
    statusdflist.append(eachstatus)

df_tweets = pd.DataFrame(statusdflist)


In [None]:
final_df = full_tweet_text.merge(df_tweets, how='left', left_on='tweet_id', 
                     right_on='tweet_id')

We discovered that Twitter’s compliance is country-specific, meaning that withheld tweets are still visible if the Twitter user changes their country to any other one. So I ran the above code once again – this time changing my Twitter account's locaiton to the United States as my country. I then built a database of what those specific tweets looked like in India versus outside of India.

The final database is below.

In [34]:
final_df = pd.read_excel('Final_all_tweets.xlsx')

In [35]:
final_df.head()

Unnamed: 0.1,Unnamed: 0,flagged_twitter_url,flagged_tweet_id,username_handle,flagged_date,lumen_filename,lumen_url,screen_name,user_desc,status_count,following_count,followers_count,active_status,tweet_id2,us_tweet_text,india_tweet_text
0,0,https://twitter.com/AMIT_GUJJU/status/11285815...,1128581518295613441,AMIT_GUJJU,2019-05-16,05_16_19_ECI_Order.pdf,,Amit Kumar,"Passionate Follower of Politics, Public Policy...",71632.0,1316.0,68101.0,active,1128581518295613441,Sources: mera exit poll\nU.P.; 58\nBihar; 16\n...,This Tweet from @AMIT_GUJJU has been withheld ...
1,1,https://twitter.com/bk_chudasama/status/112859...,1128592137350713344,bk_chudasama,2019-05-16,05_16_19_ECI_Order.pdf,,Ranjitsinh Chudasama,Convener Social Media BJP JAMNAGAR (Dist) Hono...,110136.0,917.0,14910.0,active,1128592137350713344,Code 144: No status found with that ID.,No status found with that ID (Code 144)
2,2,https://twitter.com/Dehaati_Indian/status/1128...,1128582379369426944,Dehaati_Indian,2019-05-16,05_16_19_ECI_Order.pdf,,CovidWorrior,"किसान पुत्र, \nDoctor by Profession.",20037.0,469.0,1014.0,active,1128582379369426944,Code 144: No status found with that ID.,No status found with that ID (Code 144)
3,3,https://twitter.com/sunnydeolBJP/status/112859...,1128599916731781121,sunnydeolBJP,2019-05-16,05_16_19_ECI_Order.pdf,,,,,,,Code 63: User has been suspended.,1128599916731781121,Code 63: User has been suspended.,This Tweet from @mahindrbahubali has been with...
4,4,https://twitter.com/CA_keshavKumar/status/1128...,1128652376020094976,CA_keshavKumar,2019-05-16,15_May__2019_Twitter_TDR_on_Restriction_of_Pub...,,,,,,,Code 50: User not found.,1128652376020094976,Code 144: No status found with that ID.,No status found with that ID (Code 144)


### Assigning categories to available tweets
Filtering out data without tweet text (including only accounts in the URL) and attributing categories based on commonly occurring and relevant words

In [None]:
df_tweettext = final_df[~(final_df.us_tweet_text.fillna('Code').str.contains('Code'))].sort_values(by='flagged_date', ascending=False)


In [None]:
conditions = [(df_tweettext.us_tweet_text.str.contains('kashmir|burhan|martyr|#freedom|azaadi|kasmir|शहीद', case=False, na=False)),
              (df_tweettext.us_tweet_text.str.contains('exit poll|EVM|election|vote|referendum', case=False, na=False)),
              (df_tweettext.us_tweet_text.str.contains('farmer|kisaan|farm', case=False, na=False)),
              (df_tweettext.us_tweet_text.str.contains('covid|corona|vaccine|vaccination|social distanc|PMCares|pandemic|#ModiMadeDisaster|कोरोना|कुंभ|बनारस|श्मशान', case=False, na=False)),
              (df_tweettext.us_tweet_text.str.contains('CAA|NRC|citizenship amendment act|शाहीन|बाग|shaheen|bagh', case=False, na=False)),
              (df_tweettext.us_tweet_text.str.contains('Muslim|Islam|Hindu extremist|sikh|terrorist|masjid|pakistan|hindutva|mosque|cow|ghaziabad|khalistan|पाकिस्तान', case=False, na=False)),
              (df_tweettext.us_tweet_text.str.contains('rape|Hathras|Manisha|valmiki|gay|justice|हाथरस|मनीषा', case=False, na=False)),
              (df_tweettext.us_tweet_text.str.contains('protest|slogan|naxal|democrat|riot|yogi|maoist|regime|imperialis|police|army|जवान', case=False, na=False)) 
             ]

# sequential list of values to assign for each condition
values = ['Kashmir','Election', 'Farmer Protests', 'COVID-19', 'CAA/NRC', 'Religious/Anti-National', 'Sexual Violence', 'Politics/Riots']

In [None]:
df_tweettext['category'] = np.select(conditions, values)

In [None]:
df_tweettext ['is_modi'] = np.where(df_tweettext.us_tweet_text.str.contains('Modi|PM|BJP|RSS|Narendra|मोदी', case=False, na=False), True, False)



In [None]:
df_tweettext.head()