## Open text file

In [2]:
import pandas as pd

In [None]:
df = pd.read_fwf('nprhealth.txt', header=None)
df.head(3)

Unnamed: 0,0,1,2,3,4,5
0,547824165185536000|Wed,Dec,24,18:40:33,0,2014|Would You Like Health Insurance With Thos...
1,547763897638199296|Wed,Dec,24,14:41:04,0,2014|New Blood Donation Rules Would Still Excl...
2,547671560471859200|Wed,Dec,24,08:34:09,0,2014|Obama Administration Downplays Court Chal...


In [234]:
df = df[[5]]
df.head(3)

Unnamed: 0,5
0,2014|Would You Like Health Insurance With Thos...
1,2014|New Blood Donation Rules Would Still Excl...
2,2014|Obama Administration Downplays Court Chal...


## just words - remove numbers

In [235]:
def just_text (sentence):
    just_text = ''.join(word for word in sentence if not word.isdigit())
    return just_text

In [236]:
df1 = df.applymap(lambda x: just_text(x))

In [237]:
df1.head(3)

Unnamed: 0,5
0,|Would You Like Health Insurance With Those St...
1,|New Blood Donation Rules Would Still Exclude ...
2,|Obama Administration Downplays Court Challeng...


## remove punctuation

In [246]:
type(df1)

pandas.core.frame.DataFrame

In [238]:
import string 
def remove_punctuation(text):
    punctuationfree="".join([i for i in text if i not in string.punctuation])
    return punctuationfree

In [247]:
df2 = df1.applymap(lambda x: remove_punctuation(x)) 

In [248]:
df2

Unnamed: 0,5
0,Would You Like Health Insurance With Those Sto...
1,New Blood Donation Rules Would Still Exclude M...
2,Obama Administration Downplays Court Challenge...
3,Christmas In Liberia Ebola Fears No Snow Holid...
4,Costly Hepatitis C Drugs Threaten To Bust Pris...
...,...
4832,Microbes Benefit More Than Just The Gut httpnp...
4833,Hows Your Cholesterol The Crowd Wants To Know ...
4834,How African Cattle Herders Wiped Out An Ancien...
4835,Death Toll Climbs In Congo Ebola Outbreak http...


## lower case

In [254]:
def lower_all(x):
    lower_str = x.lower()
    return lower_str

df3 = df2.applymap(lambda x: lower_all(x))
df3.head(3)

Unnamed: 0,5
0,would you like health insurance with those sto...
1,new blood donation rules would still exclude m...
2,obama administration downplays court challenge...


## Stopwords & tokenizing

In [257]:
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('english'))

def tokenizer_stopword_remover(text):
    word_tokens = word_tokenize(text) 
    important_text = [w for w in word_tokens if not w in stop_words] 
    return important_text

df4 = df3.applymap(lambda x: tokenizer_stopword_remover(x))
df4.head(3)

Unnamed: 0,5
0,"[would, like, health, insurance, stocking, stu..."
1,"[new, blood, donation, rules, would, still, ex..."
2,"[obama, administration, downplays, court, chal..."


## Lemmatize

In [262]:
from nltk.stem import WordNetLemmatizer
lemma = WordNetLemmatizer()

def get_lemm(l):
    lemm = [lemma.lemmatize(word) for word in l]
    return lemm

In [263]:
df5 = df4.applymap(lambda x: get_lemm(x))
df5.head(3)

Unnamed: 0,5
0,"[would, like, health, insurance, stocking, stu..."
1,"[new, blood, donation, rule, would, still, exc..."
2,"[obama, administration, downplays, court, chal..."


## Stemmer - not good, stupid output
 i'd guess GPT knows how to handle words that are inflected

In [264]:
from nltk.stem.porter import PorterStemmer
stemmer = PorterStemmer()

def get_stemmed(list_of_words):
        stemmed = [stemmer.stem(word) for word in list_of_words]
        return stemmed

In [266]:
df6 = df5.applymap(lambda x: get_stemmed(x))
df6.head(8)

Unnamed: 0,5
0,"[would, like, health, insur, stock, stuffer, h..."
1,"[new, blood, donat, rule, would, still, exclud..."
2,"[obama, administr, downplay, court, challeng, ..."
3,"[christma, liberia, ebola, fear, snow, holiday..."
4,"[costli, hepat, c, drug, threaten, bust, priso..."
5,"[float, toilet, clean, grow, lake, httpnprcwbbh]"
6,"[thin, doesnt, spare, asianamerican, diabet, r..."
7,"[keep, marketplac, insur, enrol, medicar, http..."


## other ways to open a text file

In [33]:
text = open ('nprhealth.txt')
text = text.readlines()
text

['547824165185536000|Wed Dec 24 18:40:33 +0000 2014|Would You Like Health Insurance With Those Stocking Stuffers? http://n.pr/1CKulog\n',
 '547763897638199296|Wed Dec 24 14:41:04 +0000 2014|New Blood Donation Rules Would Still Exclude Many Gay Men http://n.pr/1CJ9IsG\n',
 '547671560471859200|Wed Dec 24 08:34:09 +0000 2014|Obama Administration Downplays Court Challenge To Health Law http://n.pr/1CHaDd6\n',
 '547671553609981952|Wed Dec 24 08:34:08 +0000 2014|Christmas In Liberia: Ebola Fears, No Snow, Holiday Spirit http://n.pr/1CHaAxQ\n',
 '547671544680312834|Wed Dec 24 08:34:06 +0000 2014|Costly Hepatitis C Drugs Threaten To Bust Prison Budgets http://n.pr/1CHaCFX\n',
 '547463921431375872|Tue Dec 23 18:49:04 +0000 2014|Floating Toilets That Clean Themselves Grow On A Lake http://n.pr/1CwBBH8\n',
 "547426777761476608|Tue Dec 23 16:21:29 +0000 2014|Being Thin Doesn't Spare Asian-Americans From Diabetes Risk http://n.pr/1CvRzBi\n",
 '547414722568413184|Tue Dec 23 15:33:34 +0000 2014|Can I

## Regex

In [196]:
import re

In [215]:
match = re.search(r"(?=\|).*(?= http)",t)
match.group()

'| biudbcu| hello'

In [157]:
m.group()

'hello'

In [194]:
t2 = open ('nprhealth.txt')
t2 = t2.readlines()
t2

['585978391360221184|Thu Apr 09 01:31:50 +0000 2015|Breast cancer risk test devised http://bbc.in/1CimpJF\n',
 '585947808772960257|Wed Apr 08 23:30:18 +0000 2015|GP workload harming care - BMA poll http://bbc.in/1ChTBRv\n',
 "585947807816650752|Wed Apr 08 23:30:18 +0000 2015|Short people's 'heart risk greater' http://bbc.in/1ChTANp\n",
 "585866060991078401|Wed Apr 08 18:05:28 +0000 2015|New approach against HIV 'promising' http://bbc.in/1E6jAjt\n",
 "585794106170839041|Wed Apr 08 13:19:33 +0000 2015|Coalition 'undermined NHS' - doctors http://bbc.in/1CnLwK7\n",
 '585733482413891584|Wed Apr 08 09:18:39 +0000 2015|Review of case against NHS manager http://bbc.in/1Ffj6ci\n',
 "585733481608646657|Wed Apr 08 09:18:39 +0000 2015|VIDEO: 'All day is empty, what am I going to do?' http://bbc.in/1N7wSSz\n",
 "585701601131765761|Wed Apr 08 07:11:58 +0000 2015|VIDEO: 'Overhaul needed' for end-of-life care http://bbc.in/1CmrRu3\n",
 "585620828110397440|Wed Apr 08 01:51:00 +0000 2015|Care for dying 

In [192]:
import re
#Define the search term:
pattern = r"(?<=\|)(.*)(?= http)" #pattern must be enclosed in quotes

#Create an empty list:
data = []

#then

for line in open(r'nprhealth.txt'):
    if line !='':  #<-- To make sure the whole file is read
        word = re.findall(pattern, line)
        data.append(str(word))

        

In [193]:
data

["['Thu Apr 09 01:31:50 +0000 2015|Breast cancer risk test devised']",
 "['Wed Apr 08 23:30:18 +0000 2015|GP workload harming care - BMA poll']",
 '["Wed Apr 08 23:30:18 +0000 2015|Short people\'s \'heart risk greater\'"]',
 '["Wed Apr 08 18:05:28 +0000 2015|New approach against HIV \'promising\'"]',
 '["Wed Apr 08 13:19:33 +0000 2015|Coalition \'undermined NHS\' - doctors"]',
 "['Wed Apr 08 09:18:39 +0000 2015|Review of case against NHS manager']",
 '["Wed Apr 08 09:18:39 +0000 2015|VIDEO: \'All day is empty, what am I going to do?\'"]',
 '["Wed Apr 08 07:11:58 +0000 2015|VIDEO: \'Overhaul needed\' for end-of-life care"]',
 '["Wed Apr 08 01:51:00 +0000 2015|Care for dying \'needs overhaul\'"]',
 "['Tue Apr 07 13:41:42 +0000 2015|VIDEO: NHS: Labour and Tory key policies']",
 "['Tue Apr 07 13:41:42 +0000 2015|Have GP services got worse?']",
 "['Tue Apr 07 09:38:39 +0000 2015|A&amp;E waiting hits new worst level']",
 "['Tue Apr 07 03:30:52 +0000 2015|Parties row over GP opening hours']",

In [224]:
s = []

In [227]:
pat = re.compile(r'(?<=\|)(.*)(?= http)')
for line in data:
    g = pat.match(line)
    s.append(g)

## Read-in a a bunch of text files

In [None]:
files = os.listdir("/home/mz/Scripts/data/Health-Tweets")
for file in files.glob('*.txt'):

In [272]:
import os
# your_path = 'some_path'
files = os.listdir("/home/mz/Scripts/data/Health-Tweets")

In [273]:
files

['BABY_DataCleaning.ipynb',
 'nytimeshealth.txt',
 'nprhealth.txt',
 'Untitled.ipynb',
 'foxnewshealth.txt',
 'goodhealth.txt',
 'usnewshealth.txt',
 'NBChealth.txt',
 'msnhealthnews.txt',
 'cbchealth.txt',
 'everydayhealth.txt',
 'gdnhealthcare.txt',
 'latimeshealth.txt',
 'bbchealth.txt',
 '.ipynb_checkpoints',
 'cnnhealth.txt',
 'KaiserHealthNews.txt',
 'wsjhealth.txt',
 'reuters_health.txt']

In [298]:
import os
from pathlib import Path

your_path = '/home/mz/Scripts/data/Health-Tweets'

concat_text = pd.DataFrame()

for file in Path(your_path).rglob('*.txt'):
    act_text = pd.read_fwf(os.path.join(your_path, file), header = None)
    concat_text.concat(act_text)

AttributeError: 'DataFrame' object has no attribute 'concat'