From 1daa587abe6957c127d0b29e8c93b6ae98127735 Mon Sep 17 00:00:00 2001 From: Peter Organisciak Date: Tue, 16 May 2023 17:46:06 -0600 Subject: [PATCH] Allow for an alternative correct answer --- labs/08-lab-regular-expressions.ipynb | 1979 ++++++++++++------------- 1 file changed, 975 insertions(+), 1004 deletions(-) diff --git a/labs/08-lab-regular-expressions.ipynb b/labs/08-lab-regular-expressions.ipynb index 4c9851b..f498e11 100644 --- a/labs/08-lab-regular-expressions.ipynb +++ b/labs/08-lab-regular-expressions.ipynb @@ -1,1010 +1,981 @@ { - "cells": [ - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "colab_type": "text", - "id": "view-in-github" - }, - "source": [ - "\"Open" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "Zuf8EoDzwEem" - }, - "source": [ - "# Week 8 Lab: Regular Expressions" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "id": "UquVNqG4wEeo" - }, - "outputs": [], - "source": [ - "import re\n", - "import pandas as pd" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "OXHoXToPwEer" - }, - "source": [ - "Here is a synopsis of the best film of our time, *Paddington*:" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 69 - }, - "id": "PhMVBwhpwEes", - "outputId": "ebb331af-4ec3-4039-a2af-4fc4bbc37802" - }, - "outputs": [ - { - "data": { - "application/vnd.google.colaboratory.intrinsic+json": { - "type": "string" - }, - "text/plain": [ - "'After a deadly earthquake destroys his home in Peruvian rainforest, a young bear (Ben Whishaw) makes his way to England in search of a new home. The bear, dubbed \"Paddington\" for the london train station, finds shelter with the family of Henry (Hugh Bonneville) and Mary Brown (Sally Hawkins). 
Although Paddington\\'s amazement at urban living soon endears him to the Browns, someone else has her eye on him: Taxidermist Millicent Clyde (Nicole Kidman) has designs on the rare bear and his hide.'" - ] - }, - "execution_count": 1, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "synopsis = '''After a deadly earthquake destroys his home in the Peruvian rainforest, a young bear (Ben Whishaw) makes his way to England in search of a new home. The bear, dubbed \"Paddington\" for the london train station, finds shelter with the family of Henry (Hugh Bonneville) and Mary Brown (Sally Hawkins). Although Paddington's amazement at urban living soon endears him to the Browns, someone else has her eye on him: Taxidermist Millicent Clyde (Nicole Kidman) has designs on the rare bear and his hide.'''\n", - "synopsis" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "mzj3T_AUwEez" - }, - "source": [ - "For these questions, use `re.findall(pattern, text)`, like so:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 34 - }, - "id": "J6oZ8HVcwEe0", - "outputId": "77adfe21-3c6f-4443-d744-0b3a75b5daf1" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "['After ', 'Henry ', 'Brown ', 'Sally ', 'Clyde ']" - ] - }, - "execution_count": 6, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "re.findall('[A-Z]....\\s', synopsis)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For many of the questions, I'll ask you for the `pattern` that goes into that search." 
- ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "IqAdp-3iwEe5" - }, - "source": [ - "### Example Question\n", - "\n", - "- Q0.0: Write the regular expression to find all three or four-letter words:" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": { - "cellView": "form", - "colab": { - "base_uri": "https://localhost:8080/" - }, - "id": "Ub9b8TUEwEe6", - "outputId": "a915df70-cf14-497f-db87-3e4334eff5b2" - }, - "outputs": [ - { - "data": { - "text/plain": [ - "[' his ',\n", - " ' bear ',\n", - " ' his ',\n", - " ' new ',\n", - " ' The ',\n", - " ' for ',\n", - " ' with ',\n", - " ' and ',\n", - " ' soon ',\n", - " ' him ',\n", - " ' the ',\n", - " ' else ',\n", - " ' her ',\n", - " ' has ',\n", - " ' the ',\n", - " ' bear ',\n", - " ' his ']" - ] - }, - "execution_count": 5, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" - } - ], - "source": [ - "#@markdown Run this cell to see if it works\n", - "q0_answer = '\\s\\w\\w\\w\\w?\\s' #@param {type:'string'}\n", - "re.findall(q0_answer, synopsis)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "dVtGDijDwEfE" - }, - "source": [ - "## Questions #1: Matching Characters (20 pts)\n", - "\n", - "- Q1a: Write a regular expression to match all three-character words surrounded by spaces. You need to change the answer in the form field and run it to see if it works. 
(5pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ZIIqEYxXwEfJ", - "scrolled": true - }, - "outputs": [], - "source": [ - "#@markdown *Run this cell to see if it works*\n", - "q1a_answer = '' #@param {type:'string'}\n", - "re.findall(q1a_answer, synopsis) # This runs your pattern" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "esCPsEnjwEfM" - }, - "source": [ - "- Q1b: Write a regular expression to match all three-character words, allowing for instances like `(Ben` and `him:`. (5pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "qt8_JQnxwEfM" - }, - "outputs": [], - "source": [ - "#@markdown *Run this cell to see if it works*\n", - "q1b_answer = '' #@param {type:'string'}\n", - "re.findall(q1b_answer, synopsis)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "AuFhWEX-wEfQ" - }, - "source": [ - "- Q1c: Write the regular expression to determine how many times the words `the` or `The` show up. (5pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "R5LxSp3ewEfQ" - }, - "outputs": [], - "source": [ - "q1c_answer = '' #@param {type:'string'}\n", - "matches = re.findall(q1c_answer, synopsis)\n", - "print(matches)\n", - "print(\"Count\", len(matches))" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "KSxzA382wEfT" - }, - "source": [ - "- Q1d: Write a regular expression to find the context at the end of a sentence: the five characters leading up to the period. 
Results should be `[' home.', 'kins).', ' hide.']` (5pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "seEtUQAFwEfU" - }, - "outputs": [], - "source": [ - "q1d_answer = '' #@param {type:'string'}\n", - "re.findall(q1d_answer, synopsis)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "dTJiDjSpwEfX" - }, - "source": [ - "## Questions #2: Repeating Patterns (36 pts)\n", - "\n", - "*Full Reference*\n", - "\n", - "**Matching characters**\n", - "- `a` - Match the letter `a`. Same for most other characters\n", - "- `.` - Match any single character\n", - "- `\\w` - Match any word character (letters, number... support for non-English characters varies)\n", - "- `\\W` - Match any non-word characters\n", - "- `\\d` - Match any digit\n", - "- `.` - Matches *any* character\n", - "- `\\.` - Matches a literal period (or `\\\\` matches literal backslash, `\\(` matches literal parenthesis, etc)\n", - "- `\\s` - Match any whitespace character (space, tabs, line breaks sometimes)\n", - "\n", - "**Multiple Matches**\n", - "- `[ab]` - Group of multiple possible characters - in this case `a` or `b`\n", - "- `[a-z]` matches any character from a to z\n", - "- `[A-Z]` matches any character from A to Z\n", - "- `[A-Zab]` matches any character from A to Z (`A-Z`), *or* `a` *or* `b`\n", - "\n", - "**Repeating**\n", - "\n", - "*'greedy' means that it captures as much as it can, 'lazy' means it captures as little as possible.*\n", - "`?` - One or zero of the preceding match\n", - "- `+` - One or more of the preceding match (greedy)\n", - "- `*` - Zero or more of the preceding match (greedy)\n", - "- `*?`, `+?` - Lazy versions of `*` and `+`\n", - "\n", - "**Position**\n", - "- `^` - Start of line\n", - "- `$` - End of line\n", - "\n", - "*Questions*\n", - "\n", - "- Q2a: Write the expression that matches the period at the end of a sentence and the first word of the next sentence. 
The matches should be: `['. The', '. Although']` (6 pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "GCkA1uQIwEfX" - }, - "outputs": [], - "source": [ - "q2a_answer = '' #@param {type:'string'}\n", - "re.findall(q2a_answer, synopsis)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "hmv4DTKNwEfa" - }, - "source": [ - "- Q2b: Write the expression to match the two words following the word `a`. The results should be `['a deadly earthquake', 'a young bear', 'a new home']` (6 pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "VSi2v7rmwEfb" - }, - "outputs": [], - "source": [ - "q2b_answer = '' #@param {type:'string'}\n", - "re.findall(q2b_answer, synopsis)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "CgKT_nK5wEfe" - }, - "source": [ - "- Q2c: Write the expression to match all words with an `'s`. In this case, `Paddington's` is the only match. (6 pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "7zbb7pzCwEff" - }, - "outputs": [], - "source": [ - "q2c_answer = '' #@param {type:'string'}\n", - "re.findall(q2c_answer, synopsis)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "ZJNrE3OOwEfj" - }, - "source": [ - "- Q2d: Write the expression to match all values in parentheses. 
The results should be `['(Ben Whishaw)', '(Hugh Bonneville)', '(Sally Hawkins)', '(Nicole Kidman)]'` (6 pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "pLz1VYtjwEfk" - }, - "outputs": [], - "source": [ - "q2d_answer = '' #@param {type:'string'}\n", - "re.findall(q2d_answer, synopsis)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "a0WWNmnwwEfn" - }, - "source": [ - "- Q2e: Write the expression to match all capitalized words, including `Paddington's`. (6 pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "E7Z2tH1awEfo" - }, - "outputs": [], - "source": [ - "q2e_answer = '' #@param {type:'string'}\n", - "re.findall(q2e_answer, synopsis)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "IszncrdAwEfr" - }, - "source": [ - "- Q2f: Write the expression to match all pairings of capitalized words, like `Mary Brown` and `Although Paddington`. 
(6 pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "xFrn1WuNwEfs" - }, - "outputs": [], - "source": [ - "q2f_answer = '' #@param {type:'string'}\n", - "re.findall(q2f_answer, synopsis)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "BjGW8Q2zwEfv" - }, - "source": [ - "## Other Python Regular Expression Functions\n", - "\n", - "**Replace a pattern**:\n", - " \n", - "```\n", - "re.sub(pattern, repl, string)\n", - "```\n", - "\n", - "This replaces the pattern matches with whatever string you provide to `repl`.\n", - "\n", - "**Check a match**:\n", - " \n", - "```\n", - "regex.search(string)\n", - "```\n", - "\n", - "This doesn't return anything if there are no matches, so it's useful for asking, \"is there a match or not?\"" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "vx7lh1DjwEfw" - }, - "source": [ - "## Pandas\n", - "\n", - "Load the following data into a DataFrame, containing the tweets of Lord_Voldemort7:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "colab": { - "base_uri": "https://localhost:8080/", - "height": 195 - }, - "id": "Ou_n26JWwEfx", - "outputId": "a9ab9400-f94d-4163-86df-664486e096e6" - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idfavouritesretweetscreated_attextis_quoted
090366428941168640019215562017-09-01 17:02:10#19YearsLater #BackToHogwartsFalse
190121755986070323425569342017-08-25 22:59:44\"I rose up from the dead, I do it all the time...False
2879505014422740994387915782017-06-27 01:01:50#HarryPotter20 There will always be magic.False
38182933426226503687371692017-01-09 03:08:30Hermione tries to play Quidditch but hits her ...False
4815434344357625856271910122017-01-01 05:47:522016 was channeling The Little Mermaid and end...False
\n", - "
" + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "view-in-github", + "colab_type": "text" + }, + "source": [ + "\"Open" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Zuf8EoDzwEem" + }, + "source": [ + "# Week 8 Lab: Regular Expressions" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "UquVNqG4wEeo" + }, + "outputs": [], + "source": [ + "import re\n", + "import pandas as pd" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "OXHoXToPwEer" + }, + "source": [ + "Here is a synopsis of the best film of our time, *Paddington*:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 69 + }, + "id": "PhMVBwhpwEes", + "outputId": "ebb331af-4ec3-4039-a2af-4fc4bbc37802" + }, + "outputs": [ + { + "data": { + "application/vnd.google.colaboratory.intrinsic+json": { + "type": "string" + }, + "text/plain": [ + "'After a deadly earthquake destroys his home in Peruvian rainforest, a young bear (Ben Whishaw) makes his way to England in search of a new home. The bear, dubbed \"Paddington\" for the london train station, finds shelter with the family of Henry (Hugh Bonneville) and Mary Brown (Sally Hawkins). Although Paddington\\'s amazement at urban living soon endears him to the Browns, someone else has her eye on him: Taxidermist Millicent Clyde (Nicole Kidman) has designs on the rare bear and his hide.'" + ] + }, + "execution_count": 1, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "synopsis = '''After a deadly earthquake destroys his home in the Peruvian rainforest, a young bear (Ben Whishaw) makes his way to England in search of a new home. The bear, dubbed \"Paddington\" for the london train station, finds shelter with the family of Henry (Hugh Bonneville) and Mary Brown (Sally Hawkins). 
Although Paddington's amazement at urban living soon endears him to the Browns, someone else has her eye on him: Taxidermist Millicent Clyde (Nicole Kidman) has designs on the rare bear and his hide.'''\n", + "synopsis" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "mzj3T_AUwEez" + }, + "source": [ + "For these questions, use `re.findall(pattern, text)`, like so:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 34 + }, + "id": "J6oZ8HVcwEe0", + "outputId": "77adfe21-3c6f-4443-d744-0b3a75b5daf1" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "['After ', 'Henry ', 'Brown ', 'Sally ', 'Clyde ']" + ] + }, + "execution_count": 6, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "re.findall('[A-Z]....\\s', synopsis)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "NNRlqYqh8j_h" + }, + "source": [ + "For many of the questions, I'll ask you for the `pattern` that goes into that search." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IqAdp-3iwEe5" + }, + "source": [ + "### Example Question\n", + "\n", + "- Q0.0: Write the regular expression to find all three or four-letter words:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "Ub9b8TUEwEe6", + "outputId": "a915df70-cf14-497f-db87-3e4334eff5b2" + }, + "outputs": [ + { + "data": { + "text/plain": [ + "[' his ',\n", + " ' bear ',\n", + " ' his ',\n", + " ' new ',\n", + " ' The ',\n", + " ' for ',\n", + " ' with ',\n", + " ' and ',\n", + " ' soon ',\n", + " ' him ',\n", + " ' the ',\n", + " ' else ',\n", + " ' her ',\n", + " ' has ',\n", + " ' the ',\n", + " ' bear ',\n", + " ' his ']" + ] + }, + "execution_count": 5, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "#@markdown Run this cell to see if it works\n", + "q0_answer = '\\s\\w\\w\\w\\w?\\s' #@param {type:'string'}\n", + "re.findall(q0_answer, synopsis)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dVtGDijDwEfE" + }, + "source": [ + "## Questions #1: Matching Characters (20 pts)\n", + "\n", + "- Q1a: Write a regular expression to match all three-character words surrounded by spaces. You need to change the answer in the form field and run it to see if it works. (5pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ZIIqEYxXwEfJ", + "scrolled": true + }, + "outputs": [], + "source": [ + "#@markdown *Run this cell to see if it works*\n", + "q1a_answer = '' #@param {type:'string'}\n", + "re.findall(q1a_answer, synopsis) # This runs your pattern" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "esCPsEnjwEfM" + }, + "source": [ + "- Q1b: Write a regular expression to match all three-character words, allowing for instances like `(Ben` and `him:`. 
(5pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "qt8_JQnxwEfM" + }, + "outputs": [], + "source": [ + "#@markdown *Run this cell to see if it works*\n", + "q1b_answer = '' #@param {type:'string'}\n", + "re.findall(q1b_answer, synopsis)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "AuFhWEX-wEfQ" + }, + "source": [ + "- Q1c: Write the regular expression to determine how many times the words `the` or `The` show up. (5pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "R5LxSp3ewEfQ" + }, + "outputs": [], + "source": [ + "q1c_answer = '' #@param {type:'string'}\n", + "matches = re.findall(q1c_answer, synopsis)\n", + "print(matches)\n", + "print(\"Count\", len(matches))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KSxzA382wEfT" + }, + "source": [ + "- Q1d: Write a regular expression to find the context at the end of a sentence: the five characters leading up to the period. Results should be `[' home.', 'kins).', ' hide.']` (5pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "seEtUQAFwEfU" + }, + "outputs": [], + "source": [ + "q1d_answer = '' #@param {type:'string'}\n", + "re.findall(q1d_answer, synopsis)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "dTJiDjSpwEfX" + }, + "source": [ + "## Questions #2: Repeating Patterns (36 pts)\n", + "\n", + "*Full Reference*\n", + "\n", + "**Matching characters**\n", + "- `a` - Match the letter `a`. Same for most other characters\n", + "- `.` - Match any single character\n", + "- `\\w` - Match any word character (letters, number... 
support for non-English characters varies)\n", + "- `\\W` - Match any non-word characters\n", + "- `\\d` - Match any digit\n", + "- `.` - Matches *any* character\n", + "- `\\.` - Matches a literal period (or `\\\\` matches literal backslash, `\\(` matches literal parenthesis, etc)\n", + "- `\\s` - Match any whitespace character (space, tabs, line breaks sometimes)\n", + "\n", + "**Multiple Matches**\n", + "- `[ab]` - Group of multiple possible characters - in this case `a` or `b`\n", + "- `[a-z]` matches any character from a to z\n", + "- `[A-Z]` matches any character from A to Z\n", + "- `[A-Zab]` matches any character from A to Z (`A-Z`), *or* `a` *or* `b`\n", + "\n", + "**Repeating**\n", + "\n", + "*'greedy' means that it captures as much as it can, 'lazy' means it captures as little as possible.*\n", + "- `?` - One or zero of the preceding match\n", + "- `+` - One or more of the preceding match (greedy)\n", + "- `*` - Zero or more of the preceding match (greedy)\n", + "- `*?`, `+?` - Lazy versions of `*` and `+`\n", + "\n", + "**Position**\n", + "- `^` - Start of line\n", + "- `$` - End of line\n", + "\n", + "*Questions*\n", + "\n", + "- Q2a: Write the expression that matches the period at the end of a sentence and the first word of the next sentence. The matches should be: `['. The', '. Although']` (6 pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "GCkA1uQIwEfX" + }, + "outputs": [], + "source": [ + "q2a_answer = '' #@param {type:'string'}\n", + "re.findall(q2a_answer, synopsis)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hmv4DTKNwEfa" + }, + "source": [ + "- Q2b: Write the expression to match the two words following the word `a`. 
The results should be `['a deadly earthquake', 'a young bear', 'a new home']` (6 pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "VSi2v7rmwEfb" + }, + "outputs": [], + "source": [ + "q2b_answer = '' #@param {type:'string'}\n", + "re.findall(q2b_answer, synopsis)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "CgKT_nK5wEfe" + }, + "source": [ + "- Q2c: Write the expression to match all words with an `'s`. In this case, `Paddington's` is the only match. (6 pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "7zbb7pzCwEff" + }, + "outputs": [], + "source": [ + "q2c_answer = '' #@param {type:'string'}\n", + "re.findall(q2c_answer, synopsis)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "ZJNrE3OOwEfj" + }, + "source": [ + "- Q2d: Write the expression to match all values in parentheses. The results should be `['(Ben Whishaw)', '(Hugh Bonneville)', '(Sally Hawkins)', '(Nicole Kidman)']` (6 pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "pLz1VYtjwEfk" + }, + "outputs": [], + "source": [ + "q2d_answer = '' #@param {type:'string'}\n", + "re.findall(q2d_answer, synopsis)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "a0WWNmnwwEfn" + }, + "source": [ + "- Q2e: Write the expression to match all capitalized words, including `Paddington's`. (6 pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "E7Z2tH1awEfo" + }, + "outputs": [], + "source": [ + "q2e_answer = '' #@param {type:'string'}\n", + "re.findall(q2e_answer, synopsis)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "IszncrdAwEfr" + }, + "source": [ + "- Q2f: Write the expression to match all pairings of capitalized words, like `Mary Brown` and `Although Paddington`. 
(6 pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "xFrn1WuNwEfs" + }, + "outputs": [], + "source": [ + "q2f_answer = '' #@param {type:'string'}\n", + "re.findall(q2f_answer, synopsis)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "BjGW8Q2zwEfv" + }, + "source": [ + "## Other Python Regular Expression Functions\n", + "\n", + "**Replace a pattern**:\n", + " \n", + "```\n", + "re.sub(pattern, repl, string)\n", + "```\n", + "\n", + "This replaces the pattern matches with whatever string you provide to `repl`.\n", + "\n", + "**Check a match**:\n", + " \n", + "```\n", + "regex.search(string)\n", + "```\n", + "\n", + "This doesn't return anything if there are no matches, so it's useful for asking, \"is there a match or not?\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "vx7lh1DjwEfw" + }, + "source": [ + "## Pandas\n", + "\n", + "Load the following data into a DataFrame, containing the tweets of Lord_Voldemort7:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 195 + }, + "id": "Ou_n26JWwEfx", + "outputId": "a9ab9400-f94d-4163-86df-664486e096e6" + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfavouritesretweetscreated_attextis_quoted
090366428941168640019215562017-09-01 17:02:10#19YearsLater #BackToHogwartsFalse
190121755986070323425569342017-08-25 22:59:44\"I rose up from the dead, I do it all the time...False
2879505014422740994387915782017-06-27 01:01:50#HarryPotter20 There will always be magic.False
38182933426226503687371692017-01-09 03:08:30Hermione tries to play Quidditch but hits her ...False
4815434344357625856271910122017-01-01 05:47:522016 was channeling The Little Mermaid and end...False
\n", + "
" + ], + "text/plain": [ + " id ... is_quoted\n", + "0 903664289411686400 ... False\n", + "1 901217559860703234 ... False\n", + "2 879505014422740994 ... False\n", + "3 818293342622650368 ... False\n", + "4 815434344357625856 ... False\n", + "\n", + "[5 rows x 6 columns]" + ] + }, + "execution_count": 11, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } + ], + "source": [ + "tweets = pd.read_csv('https://raw.githubusercontent.com/organisciak/Scripting-Course/master/data/voldemort_tweets.csv')\n", + "tweets.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Ndbd_zOUwEfz" + }, + "source": [ + "String actions on a DataFrame column (that is, a Series) are accessed with:\n", + "\n", + "```\n", + "df['columnName'].str\n", + "```\n", + "\n", + "For matching on regular expressions, you can use `.str.contains()`. For example:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/", + "height": 166 + }, + "id": "JhDkKIGgwEf0", + "outputId": "4b0f64ef-78aa-45d0-a17f-a9c9a04224db", + "scrolled": true + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idfavouritesretweetscreated_attextis_quoted
721300324033327661056159234312013-02-09 19:23:23#ThingsYouShouldntDo: Tell Harry Potter that y...False
5493318328949160263694408022013-05-07 18:08:21If looks could kill then my body would be a de...False
21721336999084180193283228842011-11-08 00:18:36\"Over my dead body\" is trending... http://t.co...False
184616091748581258444828916932012-01-22 02:51:32First sources said Joe Paterno was dead. Now t...False
\n", + "
" + ], + "text/plain": [ + " id ... is_quoted\n", + "721 300324033327661056 ... False\n", + "549 331832894916026369 ... False\n", + "2172 133699908418019328 ... False\n", + "1846 160917485812584448 ... False\n", + "\n", + "[4 rows x 6 columns]" + ] + }, + "execution_count": 21, + "metadata": { + "tags": [] + }, + "output_type": "execute_result" + } ], - "text/plain": [ - " id ... is_quoted\n", - "0 903664289411686400 ... False\n", - "1 901217559860703234 ... False\n", - "2 879505014422740994 ... False\n", - "3 818293342622650368 ... False\n", - "4 815434344357625856 ... False\n", - "\n", - "[5 rows x 6 columns]" - ] - }, - "execution_count": 11, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" + "source": [ + "matches = tweets['text'].str.contains('dead')\n", + "tweets[matches].sample(4)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "KHKiYQ6jwEf2" + }, + "source": [ + "*Tip*: The tweet text will be cut off by default when printed. You can change the width of columns to show the entire tweet with the following code:" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "97VgtzEnwEf3" + }, + "outputs": [], + "source": [ + "pd.set_option(\"display.max_colwidth\", 160)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "hCfqr3LdwEf5" + }, + "source": [ + "## Questions 3 (24pts)\n", + "\n", + "*Unless otherwise specified, return all columns when asked to return tweets; e.g. don't remove the id or retweets columns.*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "_9TTHrs1wEf6" + }, + "source": [ + "- Q3a. Write the pattern to find all the tweets that mention \"Harry Potter\". 
(4 pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "GTti7e-twEf6" + }, + "outputs": [], + "source": [ + "q3a_answer = '' #@param {type:'string'}\n", + "matches = tweets['text'].str.contains(q3a_answer)\n", + "tweets[matches]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "WXuc2aWnwEf-" + }, + "source": [ + "- Q3b. Write the pattern to return the tweets that have a hashtag (assuming only word characters in hashtags). There should be 1432 rows. (6 pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "199e5AkdwEf-" + }, + "outputs": [], + "source": [ + "q3b_answer = '' #@param {type:'string'}\n", + "matches = tweets['text'].str.contains(q3b_answer)\n", + "tweets[matches]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "VG7bkS7owEgH" + }, + "source": [ + "- Q3c. Write the pattern to return the tweets that have at least *two* hashtags (assuming only word characters in hashtags). There should be either 190 rows or 166 rows (depending on how you interpreted the problem). (8 pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "1v0C3jl3wEgI" + }, + "outputs": [], + "source": [ + "q3c_answer = '' #@param {type:'string'}\n", + "matches = tweets['text'].str.contains(q3c_answer)\n", + "tweets[matches]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "6Qkab5LQwEgK" + }, + "source": [ + "- Q3d. Write the pattern to return the tweets that mention a year in the current millennium (or something that looks like one). 
(6 pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ORjv0yqNwEgL" + }, + "outputs": [], + "source": [ + "q3d_answer = '' #@param {type:'string'}\n", + "matches = tweets['text'].str.contains(q3d_answer)\n", + "tweets[matches]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "-LqHFbXPwEgN" + }, + "source": [ + "## Some more Pandas string methods\n", + "\n", + "Counting matches in a column:\n", + "\n", + "```\n", + "df['columnName'].str.count(pat)\n", + "```\n", + "\n", + "Replacing a pattern with a string:\n", + "\n", + "```\n", + "df['columnName'].str.replace(pat, repl)\n", + "```" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "kJGxCqQKwEgO" + }, + "source": [ + "## Questions 4: Other string methods and non-regex Pandas practice (20 pts)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "sxPB7bftwEgO" + }, + "source": [ + "- Q4a Return the 20 tweets with the most favourites. (5 pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "id": "M3BEeekXwEgP", + "scrolled": true + }, + "outputs": [], + "source": [ + "# Answer-Q4a\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "00D18Di4wEgT" + }, + "source": [ + "- Q4b How many total favourites do this account's first 100 tweets ever have? (5 pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "lGF9yvphwEgT" + }, + "outputs": [], + "source": [ + "q4b_answer = \"\" #@param {type:'string'}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "Yar0FSsnwEgW" + }, + "source": [ + "- Q4c What is the id of the tweet matching the following criteria: has the lowest retweet-to-favourites ratio, among tweets *without hashtags* and *with more than 1000 favourites*. 
(10pts)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "vzpJJuL5wEgX" + }, + "outputs": [], + "source": [ + "q4c_answer = \"\" #@param {type:'string'}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "l_qJtoTnQmAG" + }, + "source": [ + "## Submission Instructions\n", + "\n", + "Only one person from your group needs to submit the assignment." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "cellView": "form", + "id": "ZUaRz9ncQc4V" + }, + "outputs": [], + "source": [ + "#@markdown ### Enter your group member names for grading\n", + "my_name = \"\" #@param { type:'string' }\n", + "\n", + "#@markdown _Have you saved your work for yourself? Don't forget to Save a Copy in Drive so that you have your progress._" + ] } - ], - "source": [ - "tweets = pd.read_csv('https://raw.githubusercontent.com/organisciak/Scripting-Course/master/data/voldemort_tweets.csv')\n", - "tweets.head()" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "Ndbd_zOUwEfz" - }, - "source": [ - "String actions on a DataFrame column (that is, a Series) are accessed with:\n", - "\n", - "```\n", - "df['columnName'].str\n", - "```\n", - "\n", - "For matching on regular expressions, you can use `.str.contains()`. For examples:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { + ], + "metadata": { "colab": { - "base_uri": "https://localhost:8080/", - "height": 166 - }, - "id": "JhDkKIGgwEf0", - "outputId": "4b0f64ef-78aa-45d0-a17f-a9c9a04224db", - "scrolled": true - }, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idfavouritesretweetscreated_attextis_quoted
721300324033327661056159234312013-02-09 19:23:23#ThingsYouShouldntDo: Tell Harry Potter that y...False
5493318328949160263694408022013-05-07 18:08:21If looks could kill then my body would be a de...False
21721336999084180193283228842011-11-08 00:18:36\"Over my dead body\" is trending... http://t.co...False
184616091748581258444828916932012-01-22 02:51:32First sources said Joe Paterno was dead. Now t...False
\n", - "
" - ], - "text/plain": [ - " id ... is_quoted\n", - "721 300324033327661056 ... False\n", - "549 331832894916026369 ... False\n", - "2172 133699908418019328 ... False\n", - "1846 160917485812584448 ... False\n", - "\n", - "[4 rows x 6 columns]" - ] - }, - "execution_count": 21, - "metadata": { - "tags": [] - }, - "output_type": "execute_result" + "name": "08-lab-regular-expressions.ipynb", + "provenance": [], + "include_colab_link": true + }, + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.10" } - ], - "source": [ - "matches = tweets['text'].str.contains('dead')\n", - "tweets[matches].sample(4)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "KHKiYQ6jwEf2" - }, - "source": [ - "*Tip*: The tweet text will be cutoff by default when print. You can change the width of columns to show the entire tweet with the following code:" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "97VgtzEnwEf3" - }, - "outputs": [], - "source": [ - "pd.set_option(\"display.max_colwidth\", 160)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "hCfqr3LdwEf5" - }, - "source": [ - "## Questions 3 (24pts)\n", - "\n", - "*Unless otherwise specified, return all columns when asked to return tweets; e.g. don't remove the id or retweets columns.*" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "_9TTHrs1wEf6" - }, - "source": [ - "- Q3a. Write the pattern to find all the tweets that mention \"Harry Potter\". 
(4 pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "GTti7e-twEf6" - }, - "outputs": [], - "source": [ - "q3a_answer = '' #@param {type:'string'}\n", - "matches = tweets['text'].str.contains(q3a_answer)\n", - "tweets[matches]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "WXuc2aWnwEf-" - }, - "source": [ - "- Q3b. Write the pattern to return the tweets that have a hashtag (assuming only word characters in hashtags). There should be 1432 rows. (6 pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "199e5AkdwEf-" - }, - "outputs": [], - "source": [ - "q3b_answer = '' #@param {type:'string'}\n", - "matches = tweets['text'].str.contains(q3b_answer)\n", - "tweets[matches]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "VG7bkS7owEgH" - }, - "source": [ - "- Q3c. Write the pattern to return the tweets that have at least *two* hashtags (assuming only word characters in hashtags). There should be 190 rows. (8 pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "1v0C3jl3wEgI" - }, - "outputs": [], - "source": [ - "q3c_answer = '' #@param {type:'string'}\n", - "matches = tweets['text'].str.contains(q3c_answer)\n", - "tweets[matches]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "6Qkab5LQwEgK" - }, - "source": [ - "- Q3d. Write the pattern to return the tweets that mention a year in the current millenium (or something that looks like one). 
(6 pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ORjv0yqNwEgL" - }, - "outputs": [], - "source": [ - "q3d_answer = '' #@param {type:'string'}\n", - "matches = tweets['text'].str.contains(q3d_answer)\n", - "tweets[matches]" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "-LqHFbXPwEgN" - }, - "source": [ - "## Some more Pandas string methods\n", - "\n", - "Counting matches in a column:\n", - "\n", - "```\n", - "df['columnName'].str.count(pat)\n", - "```\n", - "\n", - "Replacing a pattern with a string:\n", - "\n", - "```\n", - "df['columnName'].str.replace(pat, repl)\n", - "```" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "kJGxCqQKwEgO" - }, - "source": [ - "## Questions 4: Other string methods and non-regex Pandas practice (20 pts)" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "sxPB7bftwEgO" - }, - "source": [ - "- Q4a Return the 20 tweets with the most favourites. (5 pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "id": "M3BEeekXwEgP", - "scrolled": true - }, - "outputs": [], - "source": [ - "# Answer-Q4a\n" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "00D18Di4wEgT" - }, - "source": [ - "- Q4b How many total favourites do this account's first 100 tweets ever have? (5 pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "lGF9yvphwEgT" - }, - "outputs": [], - "source": [ - "q4b_answer = \"\" #@param {type:'string'}" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "Yar0FSsnwEgW" - }, - "source": [ - "- Q4c What is the id of the tweet matching the following criteria: has the lowest retweet-to-favourites ratio, among tweets *without hashtags* and *with more than 1000 favourites*. 
(10pts)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "vzpJJuL5wEgX" - }, - "outputs": [], - "source": [ - "q4c_answer = \"\" #@param {type:'string'}" - ] - }, - { - "attachments": {}, - "cell_type": "markdown", - "metadata": { - "id": "l_qJtoTnQmAG" - }, - "source": [ - "## Submission Instructions\n", - "\n", - "Only one person from your group needs to submit the assignment." - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "cellView": "form", - "id": "ZUaRz9ncQc4V" - }, - "outputs": [], - "source": [ - "#@markdown ### Enter your group member names for grading\n", - "my_name = \"\" #@param { type:'string' }\n", - "\n", - "#@markdown _Have you saved your work for yourself? Don't forget to Save a Copy in Drive so that you have your progress._" - ] - } - ], - "metadata": { - "colab": { - "include_colab_link": true, - "name": "08-lab-regular-expressions.ipynb", - "provenance": [] - }, - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.10" - } - }, - "nbformat": 4, - "nbformat_minor": 1 -} + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file