# Extraction of Percentages from text

Here we use regular expressions to extract percentage mentions from the corpus.

In [1]:
import re
import pandas as pd

In [2]:
# Load the cleaned sentences; only the first CSV column is used.
# NOTE(review): read_csv treats the first line of the file as a header by
# default -- confirm all_sentences.csv really has one, otherwise the first
# sentence is silently consumed as the column name.
sentences = pd.read_csv("all_sentences.csv")
# Missing cells arrive as NaN (never None), so an `is not None` test cannot
# filter them. Drop them explicitly and coerce the rest to str so that every
# entry can safely be handed to re.findall.
all_sentences = sentences.iloc[:, 0].dropna().astype(str).tolist()

### Build regexes that extract percentages from a string

Example formatting:

* X%
* X.X%
* X.XX%
* XX.XX% (etc)
* X percent (X can be a numeral or an English number word)
* X percentage points
* point X percent (where X is one, two, three, etc.)



In [3]:
# Shared building blocks for the percentage patterns.
#
# Everything is written as a raw string so that regex escapes such as \d, \.
# and \b reach the regex engine untouched, and the pieces are composed with
# placeholders instead of backslash line continuations: a continuation inside
# a string literal keeps the spaces around it, which would end up inside the
# pattern and force some alternatives to start with a literal space.
#
# Only non-capturing groups are used so that re.findall returns the whole
# matched text rather than group contents.
_DIGITS = r"\d+(?:\.\d+)?"  # 5, 12.5, 0.25, ...
_UNITS = "one|two|three|four|five|six|seven|eight|nine"
_TEENS = (
    "ten|eleven|twelve|thirteen|fourteen|fifteen|sixteen|seventeen|"
    "eighteen|nineteen"
)
_TENS = "twenty|thirty|forty|fifty|sixty|seventy|eighty|ninety"
# English number words from "one" to "ninety-nine". The hyphenated compound
# is listed first so "twenty-one" is preferred over its parts, and the leading
# \b stops a match from starting inside a longer word ("someone percent").
_WORDS = rf"\b(?:(?:{_TENS})-(?:{_UNITS})|{_TEENS}|{_TENS}|{_UNITS})"

# The trailing \b keeps "percent" from matching the front of a longer word
# such as "percentage" or "percentile"; those cases belong to the dedicated
# "percentage point(s)" patterns below.

# "5%", "12.5%", "5 percent"
exp1 = rf"{_DIGITS}(?:%| percent\b)"
# "5%", "5 percentage point", "5 percentage points"
exp2 = rf"{_DIGITS}(?:%| percentage points?\b)"
# "five percent", "twenty-one percent"
exp3 = rf"{_WORDS} percent\b"
# "five percentage points"
exp4 = rf"{_WORDS} percentage points?\b"
# "point five percent"
exp5 = rf"\bpoint {_WORDS} percent\b"
# "point five percentage points"
exp6 = rf"\bpoint {_WORDS} percentage points?\b"
# "point 5%", "point 5 percent"
exp7 = rf"\bpoint {_DIGITS}(?:%| percent\b)"
# "point 5%", "point 5 percentage points"
exp8 = rf"\bpoint {_DIGITS}(?:%| percentage points?\b)"


# Run every percentage pattern over every sentence and collect the distinct
# matched strings.
percent_patterns = (exp1, exp2, exp3, exp4, exp5, exp6, exp7, exp8)

matches = []
for sentence in all_sentences:
    # A row that is not text (e.g. NaN from an empty CSV cell) cannot be
    # searched; re.findall would raise TypeError on it.
    if not isinstance(sentence, str):
        continue
    # Several patterns can return the same text for one sentence, so
    # de-duplicate per sentence before recording the hits.
    sentence_hits = {
        hit
        for pattern in percent_patterns
        for hit in re.findall(pattern, sentence)
    }
    matches.extend(sorted(sentence_hits))

# Sets of strings iterate in an order that can change between interpreter
# runs, so sort the unique matches to keep the result reproducible.
found_percentages = sorted(set(matches))

### Write the found percentages to a CSV file

In [4]:
# Write one matched percentage string per row, without an index column or a
# header line.
percent_series = pd.Series(found_percentages)
percent_series.to_csv("found_percentages.csv", index=False, header=False)