-
-
Notifications
You must be signed in to change notification settings - Fork 19k
Description
Pandas version checks
- I have checked that this issue has not already been reported.
- I have confirmed this bug exists on the latest version of pandas.
- I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import pandas as pd
my_dict = my_dict = {'content': {'0': '{\n "metadata": {\n "name": ""\n },\n "nbformat": 3,\n "nbformat_minor": 0,\n "worksheets": [\n {\n "cells": [\n {\n "cell_type": "code",\n "collapsed": false,\n "input": [\n "import klustaviewa.dataio as kvio"\n ],\n "language": "python",\n "metadata": {},\n "outputs": [],\n "prompt_number": 1\n },\n {\n "cell_type": "code",\n "collapsed": false,\n "input": [\n "fet_filename = r\\"D:\\\\Spike sorting\\\\sirota\\\\ec016.694_711.fet.7\\"\\n",\n "spk_filename = r\\"D:\\\\Spike sorting\\\\sirota\\\\ec016.694_711.spk.7\\""\n ],\n "language": "python",\n "metadata": {},\n "outputs": [],\n "prompt_number": 2\n },\n {\n "cell_type": "code",\n "collapsed": false,\n "input": [\n "fet = kvio.MemMappedText(fet_filename, np.int16, skiprows=1)"\n ],\n "language": "python",\n "metadata": {},\n "outputs": [],\n "prompt_number": 3\n },\n {\n "cell_type": "code",\n "collapsed": false,\n "input": [\n "%timeit -n 100 -r 1 [fet.next() for _ in xrange(1000)]"\n ],\n "language": "python",\n "metadata": {},\n "outputs": [\n {\n "output_type": "stream",\n "stream": "stdout",\n "text": [\n "100 loops, best of 1: 3.62 ms per loop\\n"\n ]\n }\n ],\n "prompt_number": 4\n },\n {\n "cell_type": "code",\n "collapsed": false,\n "input": [\n "spk = kvio.MemMappedBinary(spk_filename, np.int16, 89*32)"\n ],\n "language": "python",\n "metadata": {},\n "outputs": [],\n "prompt_number": 5\n },\n {\n "cell_type": "code",\n "collapsed": false,\n "input": [\n "%timeit -n 100 -r 1 [spk.next() for _ in xrange(1000)]"\n ],\n "language": "python",\n "metadata": {},\n "outputs": [\n {\n "output_type": "stream",\n "stream": "stdout",\n "text": [\n "100 loops, best of 1: 10.5 ms per loop\\n"\n ]\n }\n ],\n "prompt_number": 6\n },\n {\n "cell_type": "code",\n "collapsed": false,\n "input": [],\n "language": "python",\n "metadata": {},\n "outputs": []\n }\n ],\n "metadata": {}\n }\n ]\n}',
'1': '{\n "cells": [\n {\n "cell_type": "markdown",\n "metadata": {},\n "source": [\n "# Scraping Reviews #\\n",\n "\\n",\n "This notebook shows how to use the scrape reviews from Indeed and Glassdoor. To visualize the ratings go to the [Ratings](Ratings.html) notebook and to do topic modeling go to the [Topic Modeling](Topic-Modeling.html) notebook.\\n",\n "\\n",\n "Before, make sure you have MongoDB up and running."\n ]\n },\n {\n "cell_type": "markdown",\n "metadata": {},\n "source": [\n "## Parameters ##"\n ]\n },\n {\n "cell_type": "code",\n "execution_count": 1,\n "metadata": {\n "collapsed": false\n },\n "outputs": [],\n "source": [\n "# Search settings\\n",\n "KEYWORD_FILTER = \\"Data Scientist\\"\\n",\n "LOCATION_FILTER = \\"New York City, NY\\"\\n",\n "\\n",\n "# Other settings\\n",\n "MAX_PAGES_COMPANIES = 500\\n",\n "MAX_PAGES_REVIEWS = 500"\n ]\n },\n {\n "cell_type": "code",\n "execution_count": 14,\n "metadata": {\n "collapsed": false\n },\n "outputs": [],\n "source": [\n "import os\\n",\n "import re\\n",\n "from datetime import datetime\\n",\n "from pymongo import MongoClient\\n",\n "import indeed\\n",\n "import glassdoor\\n",\n "import utils"\n ]\n },\n {\n "cell_type": "code",\n "execution_count": 4,\n "metadata": {\n "collapsed": false\n },\n "outputs": [],\n "source": [\n "# DB settings\\n",\n "client = MongoClient()\\n",\n "indeed_db = client.indeed\\n",\n "indeed_jobs = indeed_db.jobs\\n",\n "indeed_reviews = indeed_db.reviews\\n",\n "glassdoor_db = client.glassdoor\\n",\n "glassdoor_reviews = glassdoor_db.reviews"\n ]\n },\n {\n "cell_type": "markdown",\n "metadata": {},\n "source": [\n "## Scrape job listings from Indeed ##"\n ]\n },\n {\n "cell_type": "code",\n "execution_count": null,\n "metadata": {\n "collapsed": false\n },\n "outputs": [],\n "source": [\n "jobs = indeed.get_jobs(KEYWORD_FILTER, LOCATION_FILTER, indeed_jobs, MAX_PAGES_COMPANIES)"\n ]\n },\n {\n "cell_type": "markdown",\n "metadata": {},\n "source": [\n "## Scrape company 
reviews from Indeed ##\\n",\n "\\n",\n "This takes all the companies that appear in the jobs scraped."\n ]\n },\n {\n "cell_type": "code",\n "execution_count": null,\n "metadata": {\n "collapsed": false\n },\n "outputs": [],\n "source": [\n "indeed.get_all_company_reviews(jobs, indeed_reviews, MAX_PAGES_REVIEWS)"\n ]\n },\n {\n "cell_type": "code",\n "execution_count": 5,\n "metadata": {\n "collapsed": false\n },\n "outputs": [\n {\n "data": {\n "text/plain": [\n "{u\'_id\': ObjectId(\'54f763e3bcccd9197dbbdb91\'),\\n",\n " u\'company_name\': u\'American Express\',\\n",\n " u\'date\': datetime.datetime(2013, 4, 3, 0, 0),\\n",\n " u\'employment_status\': u\'\\\\xa0(Former Employee),\\\\xa0\',\\n",\n " u\'job_title\': u\'Shipping Clerk\',\\n",\n " u\'location\': u\'Piscataway, New Jersey\',\\n",\n " u\'rating\': u\'5.0\',\\n",\n " u\'review_cons\': u\'Cons: long hours doing the christmas season\',\\n",\n " u\'review_pros\': u\'Pros: you are able to apply for a credit card\',\\n",\n " u\'review_text\': u\'If you are looking for a job to retire from and the work is not hard,then American Express is that company.\',\\n",\n " u\'review_title\': u\'A Company with a future\',\\n",\n " u\'stars\': {u\'Compensation/Benefits\': 5,\\n",\n " u\'Job Culture\': 5,\\n",\n " u\'Job Security/Advancement\': 5,\\n",\n " u\'Job Work/Life Balance\': 5,\\n",\n " u\'Management\': 5}}"\n ]\n },\n "execution_count": 5,\n "metadata": {},\n "output_type": "execute_result"\n }\n ],\n "source": [\n "indeed_reviews.find_one()"\n ]\n },\n {\n "cell_type": "markdown",\n "metadata": {},\n "source": [\n "# Fix Company Names #\\n",\n "\\n",\n "Indeed\'s company names are inconsistent. The same company can be listed several times with various spellings/typos/words. It\'s necessary to look at the companies and fix the names. The utils module has a function which takes a dictionary that takes the old name and returns the new one (names not in the dictionary are left as is). 
See below for an example (the one I used had over 30 name fixes."\n ]\n },\n {\n "cell_type": "code",\n "execution_count": 17,\n "metadata": {\n "collapsed": false\n },\n "outputs": [\n {\n "data": {\n "text/plain": [\n "[u\'Financial Times\',\\n",\n " u\'McGraw Hill Financial\',\\n",\n " u\'The Nielsen Company\',\\n",\n " u\'Continuum\',\\n",\n " u\'RUSSELL INVESTMENTS\']"\n ]\n },\n "execution_count": 17,\n "metadata": {},\n "output_type": "execute_result"\n }\n ],\n "source": [\n "companies = list(set(utils.get_company_names(indeed_reviews)))\\n",\n "companies[:5]"\n ]\n },\n {\n "cell_type": "code",\n "execution_count": null,\n "metadata": {\n "collapsed": false\n },\n "outputs": [],\n "source": [\n "fix_companies = {\'Argus, ISO, Verisk Analytics, Verisk Climate, Veri...\': \'Verisk Analytics\',\\n",\n " \'Barclays Investment Bank\': \'Barclays\', \'Dun & Brandstreet\': u\'Dun & Bradstreet\', \\n",\n " \'Dun & Broadstreet\':u\'Dun & Bradstreet\', \'World Business Lenders - New York, NY\':\'World Business Lenders\'\\n",\n " }\\n",\n "utils.fix_all_company_names(indeed_reviews, fix_companies)"\n ]\n },\n {\n "cell_type": "code",\n "execution_count": null,\n "metadata": {\n "collapsed": false\n },\n "outputs": [],\n "source": [\n "companies = list(set(utils.get_company_names(indeed_reviews)))"\n ]\n },\n {\n "cell_type": "markdown",\n "metadata": {},\n "source": [\n "## Scrape Glassdoor ##"\n ]\n },\n {\n "cell_type": "code",\n "execution_count": null,\n "metadata": {\n "collapsed": false\n },\n "outputs": [],\n "source": [\n "visited_companies, failed_companies = glassdoor.get_all_company_reviews(companies, \\n",\n " glassdoor_reviews, MAX_PAGES_REVIEWS)"\n ]\n },\n {\n "cell_type": "markdown",\n "metadata": {},\n "source": [\n "## Final Fixes ##\\n",\n "\\n",\n "Look at the failed companies. Often they couldn\'t be found on glassdoor because of an issue with their name. 
You might need to fix the names again (and search on glassdoor for the name some companies are listed under). Beware of encoding issues: if you pass an optional flag to utils.fix_company_name, you can encode the company names to ascii.\\n",\n "\\n",\n "Note: this is usually quite a bit slower than Indeed because there are many more reviews (e.g. Goldman Sachs has 198 pages!)."\n ]\n },\n {\n "cell_type": "code",\n "execution_count": null,\n "metadata": {\n "collapsed": false\n },\n "outputs": [],\n "source": [\n "# fix_companies = {u\'SigmaTek\':u\'SigmaTek Consulting LLC\',\\n",\n "# }\\n",\n "# utils.fix_all_company_names(indeed_reviews, fix_companies)\\n",\n "# fixed_failed_companies = fixed_failed_companies = [utils.fix_company_name(company,\\n",\n "# fix_companies, True) for company in failed_companies]\\n",\n "# visited_companies2, failed_companies = glassdoor.get_all_company_reviews(fixed_failed_companies, \\n",\n "# glassdoor_reviews, MAX_PAGES_REVIEWS)"\n ]\n },\n {\n "cell_type": "markdown",\n "metadata": {},\n "source": [\n "Here I would do one last check too see which companies were scraped in glassdoor and indeed. Occasionally the wrong company might have been scraped on glassdoor."\n ]\n },\n {\n "cell_type": "code",\n "execution_count": null,\n "metadata": {\n "collapsed": false\n },\n "outputs": [],\n "source": [\n "glassdoor_companies = set(utils.get_company_names(glassdoor_reviews))\\n",\n "indeed_companies = set(utils.get_company_names(indeed_reviews))\\n",\n "\\n",\n "# Remove the extra companies:\\n",\n "extra_companies = glassdoor_companies - indeed_companies\\n",\n "for company in extra_companies:\\n",\n " glassdoor_reviews.remove({\'company\' : company})\\n",\n "\\n",\n "print \\"Missing companies\\", indeed_companies - glassdoor_companies"\n ]\n },\n {\n "cell_type": "markdown",\n "metadata": {},\n "source": [\n "## Done! ##\\n",\n "Now all of the data is in the Mongo database. 
To visualize the ratings go to the [Ratings](Ratings.ipynb) notebook and to do topic modeling go to the [Topic Modeling](Topic Modeling.ipynb) notebook."\n ]\n }\n ],\n "metadata": {\n "kernelspec": {\n "display_name": "Python 2",\n "language": "python",\n "name": "python2"\n },\n "language_info": {\n "codemirror_mode": {\n "name": "ipython",\n "version": 2\n },\n "file_extension": ".py",\n "mimetype": "text/x-python",\n "name": "python",\n "nbconvert_exporter": "python",\n "pygments_lexer": "ipython2",\n "version": "2.7.9"\n }\n },\n "nbformat": 4,\n "nbformat_minor": 0\n}\n'},
'language': {'0': 'Jupyter Notebook', '1': 'Jupyter Notebook'}}
toy_df = pd.DataFrame.from_dict(my_dict)
toy_df.dtypes
toy_df.infer_objects().dtypes
toy_df.content.astype('str')
# Returns object dtype, not string
Issue Description
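For some reason I can't convert an object-dtype column to a string dtype in a pandas DataFrame: in the example above, neither `toy_df.infer_objects()` nor `toy_df.content.astype('str')` changes the dtype, and the `content` column is still reported as object.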
Expected Behavior
I expect `infer_objects()` to infer a string dtype and/or `astype('str')` to convert the object-dtype column to a string dtype.
Installed Versions
INSTALLED VERSIONS
commit : e8093ba
python : 3.8.9.final.0
python-bits : 64
OS : Darwin
OS-release : 21.5.0
Version : Darwin Kernel Version 21.5.0: Tue Apr 26 21:08:37 PDT 2022; root:xnu-8020.121.3~4/RELEASE_ARM64_T6000
machine : arm64
processor : arm
byteorder : little
LC_ALL : en_US.UTF-8
LANG : en_US.UTF-8
LOCALE : en_US.UTF-8
pandas : 1.4.3
numpy : 1.23.0
pytz : 2022.1
dateutil : 2.8.2
setuptools : 63.1.0
pip : 22.1.2
Cython : None
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : None
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 3.1.2
IPython : 8.4.0
pandas_datareader: None
bs4 : 4.11.1
bottleneck : None
brotli : None
fastparquet : None
fsspec : None
gcsfs : None
markupsafe : 2.1.1
matplotlib : 3.5.2
numba : None
numexpr : None
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 8.0.0
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : 1.8.1
snappy : None
sqlalchemy : None
tables : None
tabulate : None
xarray : None
xlrd : None
xlwt : None
zstandard : None